import time
from collections import Counter
import matplotlib.pyplot as plt
import plotly.express as px
import statistics
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector, VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score
# config plot
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)
# random state value
random_state = 42
%matplotlib inline
# load the superconductivity training set; pd.read_csv already returns a
# DataFrame, so the former pd.DataFrame(dt) re-wrap added nothing
dt = pd.read_csv("train.csv", sep=",")
df = dt
# display the frame (notebook cell output)
df
| number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | wtd_std_atomic_mass | mean_fie | wtd_mean_fie | gmean_fie | wtd_gmean_fie | entropy_fie | wtd_entropy_fie | range_fie | wtd_range_fie | std_fie | wtd_std_fie | mean_atomic_radius | wtd_mean_atomic_radius | gmean_atomic_radius | wtd_gmean_atomic_radius | entropy_atomic_radius | wtd_entropy_atomic_radius | range_atomic_radius | wtd_range_atomic_radius | std_atomic_radius | wtd_std_atomic_radius | mean_Density | wtd_mean_Density | gmean_Density | wtd_gmean_Density | entropy_Density | wtd_entropy_Density | range_Density | wtd_range_Density | std_Density | wtd_std_Density | mean_ElectronAffinity | wtd_mean_ElectronAffinity | gmean_ElectronAffinity | wtd_gmean_ElectronAffinity | entropy_ElectronAffinity | wtd_entropy_ElectronAffinity | range_ElectronAffinity | wtd_range_ElectronAffinity | std_ElectronAffinity | wtd_std_ElectronAffinity | mean_FusionHeat | wtd_mean_FusionHeat | gmean_FusionHeat | wtd_gmean_FusionHeat | entropy_FusionHeat | wtd_entropy_FusionHeat | range_FusionHeat | wtd_range_FusionHeat | std_FusionHeat | wtd_std_FusionHeat | mean_ThermalConductivity | wtd_mean_ThermalConductivity | gmean_ThermalConductivity | wtd_gmean_ThermalConductivity | entropy_ThermalConductivity | wtd_entropy_ThermalConductivity | range_ThermalConductivity | wtd_range_ThermalConductivity | std_ThermalConductivity | wtd_std_ThermalConductivity | mean_Valence | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | critical_temp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 88.944468 | 57.862692 | 66.361592 | 36.116612 | 1.181795 | 1.062396 | 122.90607 | 31.794921 | 51.968828 | 53.622535 | 775.425000 | 1010.268571 | 718.152900 | 938.016780 | 1.305967 | 0.791488 | 810.6 | 735.985714 | 323.811808 | 355.562967 | 160.250000 | 105.514286 | 136.126003 | 84.528423 | 1.259244 | 1.207040 | 205 | 42.914286 | 75.237540 | 69.235569 | 4654.35725 | 2961.502286 | 724.953211 | 53.543811 | 1.033129 | 0.814598 | 8958.571 | 1579.583429 | 3306.162897 | 3572.596624 | 81.837500 | 111.727143 | 60.123179 | 99.414682 | 1.159687 | 0.787382 | 127.05 | 80.987143 | 51.433712 | 42.558396 | 6.905500 | 3.846857 | 3.479475 | 1.040986 | 1.088575 | 0.994998 | 12.878 | 1.744571 | 4.599064 | 4.666920 | 107.756645 | 61.015189 | 7.062488 | 0.621979 | 0.308148 | 0.262848 | 399.97342 | 57.127669 | 168.854244 | 138.517163 | 2.25 | 2.257143 | 2.213364 | 2.219783 | 1.368922 | 1.066221 | 1 | 1.085714 | 0.433013 | 0.437059 | 29.00 |
| 1 | 5 | 92.729214 | 58.518416 | 73.132787 | 36.396602 | 1.449309 | 1.057755 | 122.90607 | 36.161939 | 47.094633 | 53.979870 | 766.440000 | 1010.612857 | 720.605511 | 938.745413 | 1.544145 | 0.807078 | 810.6 | 743.164286 | 290.183029 | 354.963511 | 161.200000 | 104.971429 | 141.465215 | 84.370167 | 1.508328 | 1.204115 | 205 | 50.571429 | 67.321319 | 68.008817 | 5821.48580 | 3021.016571 | 1237.095080 | 54.095718 | 1.314442 | 0.914802 | 10488.571 | 1667.383429 | 3767.403176 | 3632.649185 | 90.890000 | 112.316429 | 69.833315 | 101.166398 | 1.427997 | 0.838666 | 127.05 | 81.207857 | 49.438167 | 41.667621 | 7.784400 | 3.796857 | 4.403790 | 1.035251 | 1.374977 | 1.073094 | 12.878 | 1.595714 | 4.473363 | 4.603000 | 172.205316 | 61.372331 | 16.064228 | 0.619735 | 0.847404 | 0.567706 | 429.97342 | 51.413383 | 198.554600 | 139.630922 | 2.00 | 2.257143 | 1.888175 | 2.210679 | 1.557113 | 1.047221 | 2 | 1.128571 | 0.632456 | 0.468606 | 26.00 |
| 2 | 4 | 88.944468 | 57.885242 | 66.361592 | 36.122509 | 1.181795 | 0.975980 | 122.90607 | 35.741099 | 51.968828 | 53.656268 | 775.425000 | 1010.820000 | 718.152900 | 939.009036 | 1.305967 | 0.773620 | 810.6 | 743.164286 | 323.811808 | 354.804183 | 160.250000 | 104.685714 | 136.126003 | 84.214573 | 1.259244 | 1.132547 | 205 | 49.314286 | 75.237540 | 67.797712 | 4654.35725 | 2999.159429 | 724.953211 | 53.974022 | 1.033129 | 0.760305 | 8958.571 | 1667.383429 | 3306.162897 | 3592.019281 | 81.837500 | 112.213571 | 60.123179 | 101.082152 | 1.159687 | 0.786007 | 127.05 | 81.207857 | 51.433712 | 41.639878 | 6.905500 | 3.822571 | 3.479475 | 1.037439 | 1.088575 | 0.927479 | 12.878 | 1.757143 | 4.599064 | 4.649635 | 107.756645 | 60.943760 | 7.062488 | 0.619095 | 0.308148 | 0.250477 | 399.97342 | 57.127669 | 168.854244 | 138.540613 | 2.25 | 2.271429 | 2.213364 | 2.232679 | 1.368922 | 1.029175 | 1 | 1.114286 | 0.433013 | 0.444697 | 19.00 |
| 3 | 4 | 88.944468 | 57.873967 | 66.361592 | 36.119560 | 1.181795 | 1.022291 | 122.90607 | 33.768010 | 51.968828 | 53.639405 | 775.425000 | 1010.544286 | 718.152900 | 938.512777 | 1.305967 | 0.783207 | 810.6 | 739.575000 | 323.811808 | 355.183884 | 160.250000 | 105.100000 | 136.126003 | 84.371352 | 1.259244 | 1.173033 | 205 | 46.114286 | 75.237540 | 68.521665 | 4654.35725 | 2980.330857 | 724.953211 | 53.758486 | 1.033129 | 0.788889 | 8958.571 | 1623.483429 | 3306.162897 | 3582.370597 | 81.837500 | 111.970357 | 60.123179 | 100.244950 | 1.159687 | 0.786900 | 127.05 | 81.097500 | 51.433712 | 42.102344 | 6.905500 | 3.834714 | 3.479475 | 1.039211 | 1.088575 | 0.964031 | 12.878 | 1.744571 | 4.599064 | 4.658301 | 107.756645 | 60.979474 | 7.062488 | 0.620535 | 0.308148 | 0.257045 | 399.97342 | 57.127669 | 168.854244 | 138.528893 | 2.25 | 2.264286 | 2.213364 | 2.226222 | 1.368922 | 1.048834 | 1 | 1.100000 | 0.433013 | 0.440952 | 22.00 |
| 4 | 4 | 88.944468 | 57.840143 | 66.361592 | 36.110716 | 1.181795 | 1.129224 | 122.90607 | 27.848743 | 51.968828 | 53.588771 | 775.425000 | 1009.717143 | 718.152900 | 937.025573 | 1.305967 | 0.805230 | 810.6 | 728.807143 | 323.811808 | 356.319281 | 160.250000 | 106.342857 | 136.126003 | 84.843442 | 1.259244 | 1.261194 | 205 | 36.514286 | 75.237540 | 70.634448 | 4654.35725 | 2923.845143 | 724.953211 | 53.117029 | 1.033129 | 0.859811 | 8958.571 | 1491.783429 | 3306.162897 | 3552.668664 | 81.837500 | 111.240714 | 60.123179 | 97.774719 | 1.159687 | 0.787396 | 127.05 | 80.766429 | 51.433712 | 43.452059 | 6.905500 | 3.871143 | 3.479475 | 1.044545 | 1.088575 | 1.044970 | 12.878 | 1.744571 | 4.599064 | 4.684014 | 107.756645 | 61.086617 | 7.062488 | 0.624878 | 0.308148 | 0.272820 | 399.97342 | 57.127669 | 168.854244 | 138.493671 | 2.25 | 2.242857 | 2.213364 | 2.206963 | 1.368922 | 1.096052 | 1 | 1.057143 | 0.433013 | 0.428809 | 23.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21258 | 4 | 106.957877 | 53.095769 | 82.515384 | 43.135565 | 1.177145 | 1.254119 | 146.88130 | 15.504479 | 65.764081 | 43.202659 | 661.775000 | 753.793333 | 651.611213 | 750.570867 | 1.371139 | 0.927050 | 273.0 | 427.546667 | 114.383355 | 64.428777 | 176.500000 | 138.244444 | 169.947614 | 134.233861 | 1.350879 | 1.112222 | 111 | 57.808889 | 45.883003 | 35.811330 | 7341.25000 | 4963.928889 | 6404.741690 | 4082.735787 | 1.284617 | 1.110551 | 7511.000 | 2449.715556 | 2981.637585 | 2978.412680 | 63.825000 | 86.511111 | 49.825259 | 61.057784 | 1.164382 | 0.511822 | 117.90 | 74.140000 | 43.467883 | 55.219027 | 25.700000 | 34.448000 | 22.494622 | 29.041737 | 1.245563 | 0.631341 | 36.400 | 27.497778 | 14.446107 | 17.657417 | 65.500000 | 111.537778 | 42.371302 | 95.001493 | 1.029002 | 0.634332 | 134.00000 | 83.048889 | 55.056789 | 46.595943 | 3.25 | 3.555556 | 3.223710 | 3.519911 | 1.377820 | 0.913658 | 1 | 2.168889 | 0.433013 | 0.496904 | 2.44 |
| 21259 | 5 | 92.266740 | 49.021367 | 64.812662 | 32.867748 | 1.323287 | 1.571630 | 188.38390 | 7.353333 | 69.232655 | 50.148287 | 747.780000 | 989.819048 | 702.115184 | 923.426093 | 1.541006 | 0.988470 | 810.6 | 659.771429 | 293.286136 | 345.450969 | 159.200000 | 112.000000 | 139.743372 | 89.633687 | 1.507321 | 1.536691 | 205 | 20.285714 | 67.211309 | 72.795212 | 5174.28580 | 2827.415190 | 962.364248 | 66.286408 | 1.165065 | 1.080432 | 11848.571 | 1705.918143 | 4506.895480 | 3937.105612 | 60.484000 | 100.490952 | 29.177546 | 60.814837 | 1.112639 | 0.693424 | 138.63 | 74.090000 | 58.434336 | 57.634544 | 6.812400 | 4.793429 | 3.839585 | 1.315769 | 1.345508 | 1.203223 | 12.878 | 2.378952 | 4.340205 | 5.204855 | 132.805316 | 108.680590 | 17.747413 | 1.577047 | 0.949904 | 0.745515 | 399.97342 | 76.176553 | 151.164639 | 156.558695 | 2.20 | 2.047619 | 2.168944 | 2.038991 | 1.594167 | 1.337246 | 1 | 0.904762 | 0.400000 | 0.212959 | 122.10 |
| 21260 | 2 | 99.663190 | 95.609104 | 99.433882 | 95.464320 | 0.690847 | 0.530198 | 13.51362 | 53.041104 | 6.756810 | 5.405448 | 733.550000 | 691.580000 | 730.207231 | 689.480961 | 0.688594 | 0.542120 | 139.9 | 370.180000 | 69.950000 | 55.960000 | 183.500000 | 192.200000 | 182.926215 | 191.826893 | 0.690022 | 0.465055 | 29 | 124.600000 | 14.500000 | 11.600000 | 10296.50000 | 9260.600000 | 10150.719679 | 9170.377777 | 0.679023 | 0.572700 | 3453.000 | 4451.400000 | 1726.500000 | 1381.200000 | 71.400000 | 81.120000 | 69.537903 | 79.871364 | 0.667182 | 0.397810 | 32.40 | 59.040000 | 16.200000 | 12.960000 | 21.750000 | 24.780000 | 21.155614 | 24.380978 | 0.665945 | 0.395385 | 10.100 | 18.100000 | 5.050000 | 4.040000 | 62.500000 | 57.400000 | 61.919302 | 57.038314 | 0.683870 | 0.559446 | 17.00000 | 29.000000 | 8.500000 | 6.800000 | 4.50 | 4.800000 | 4.472136 | 4.781762 | 0.686962 | 0.450561 | 1 | 3.200000 | 0.500000 | 0.400000 | 1.98 |
| 21261 | 2 | 99.663190 | 97.095602 | 99.433882 | 96.901083 | 0.690847 | 0.640883 | 13.51362 | 31.115202 | 6.756810 | 6.249958 | 733.550000 | 706.969000 | 730.207231 | 704.143255 | 0.688594 | 0.648876 | 139.9 | 208.799000 | 69.950000 | 64.702805 | 183.500000 | 189.010000 | 182.926215 | 188.514109 | 0.690022 | 0.590271 | 29 | 84.230000 | 14.500000 | 13.412304 | 10296.50000 | 9640.430000 | 10150.719679 | 9518.329826 | 0.679023 | 0.667210 | 3453.000 | 2186.170000 | 1726.500000 | 1596.989169 | 71.400000 | 77.556000 | 69.537903 | 75.915236 | 0.667182 | 0.527718 | 32.40 | 43.332000 | 16.200000 | 14.984781 | 21.750000 | 23.669000 | 21.155614 | 23.144890 | 0.665945 | 0.525289 | 10.100 | 13.315000 | 5.050000 | 4.671182 | 62.500000 | 59.270000 | 61.919302 | 58.781651 | 0.683870 | 0.659671 | 17.00000 | 15.250000 | 8.500000 | 7.862385 | 4.50 | 4.690000 | 4.472136 | 4.665819 | 0.686962 | 0.577601 | 1 | 2.210000 | 0.500000 | 0.462493 | 1.84 |
| 21262 | 3 | 87.468333 | 86.858500 | 82.555758 | 80.458722 | 1.041270 | 0.895229 | 71.75500 | 43.144000 | 29.905282 | 33.927941 | 856.166667 | 821.190000 | 852.858789 | 818.631908 | 1.094784 | 0.968771 | 181.3 | 285.510000 | 74.569624 | 65.291691 | 127.333333 | 137.500000 | 125.493812 | 136.084313 | 1.084025 | 0.883461 | 53 | 67.700000 | 21.853045 | 19.345542 | 6311.00000 | 6914.900000 | 6186.508901 | 6830.731801 | 1.078970 | 0.874139 | 3055.000 | 3455.100000 | 1248.208583 | 1039.909655 | 135.133333 | 104.930000 | 86.530864 | 57.563783 | 0.839248 | 0.747947 | 179.30 | 68.080000 | 83.414480 | 87.740504 | 12.233333 | 14.440000 | 10.925390 | 13.816376 | 1.002899 | 0.826780 | 12.100 | 6.460000 | 5.062498 | 3.481724 | 27.506667 | 40.752000 | 4.976422 | 12.919996 | 0.194158 | 0.142553 | 78.48000 | 39.448000 | 36.425359 | 38.254432 | 5.00 | 4.500000 | 4.762203 | 4.242641 | 1.054920 | 0.970116 | 3 | 1.800000 | 1.414214 | 1.500000 | 12.80 |
21263 rows × 82 columns
# generating profile report of the whole dataset - FIRST RUN ONLY
# (kept as real comments instead of a bare triple-quoted string: a string
#  expression is evaluated by the interpreter and echoed as stray cell output)
# profile = ProfileReport(df, title="PD_TPC2_Superconductors")
# profile.to_file("appendix/PD_TPC2_Superconductors.html")
'\nprofile = ProfileReport(df, title="PD_TPC2_Superconductors")\nprofile.to_file("appendix/PD_TPC2_Superconductors.html")\n'
# data types: inspect the dtype of every column (per the output below they
# are all numeric — int64/float64 — so no categorical encoding is needed)
df.dtypes
number_of_elements int64
mean_atomic_mass float64
wtd_mean_atomic_mass float64
gmean_atomic_mass float64
wtd_gmean_atomic_mass float64
...
range_Valence int64
wtd_range_Valence float64
std_Valence float64
wtd_std_Valence float64
critical_temp float64
Length: 82, dtype: object
# missing values: count NaNs per column (output below shows none)
df.isnull().sum()
number_of_elements 0
mean_atomic_mass 0
wtd_mean_atomic_mass 0
gmean_atomic_mass 0
wtd_gmean_atomic_mass 0
..
range_Valence 0
wtd_range_Valence 0
std_Valence 0
wtd_std_Valence 0
critical_temp 0
Length: 82, dtype: int64
# checking for duplicates: Counter tallies duplicated-row flags
# (True = duplicate of an earlier row, False = unique)
Counter(df.duplicated())
Counter({False: 21197, True: 66})
# drop duplicates (keeps the first occurrence; original index labels are preserved)
df = df.drop_duplicates()
# checking for duplicates again to confirm the drop worked (expect all False)
Counter(df.duplicated())
Counter({False: 21197})
Summary of the dataset after deleting duplicates
# describe dataset: count / mean / std / min / quartiles / max per column
df.describe()
| number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | wtd_std_atomic_mass | mean_fie | wtd_mean_fie | gmean_fie | wtd_gmean_fie | entropy_fie | wtd_entropy_fie | range_fie | wtd_range_fie | std_fie | wtd_std_fie | mean_atomic_radius | wtd_mean_atomic_radius | gmean_atomic_radius | wtd_gmean_atomic_radius | entropy_atomic_radius | wtd_entropy_atomic_radius | range_atomic_radius | wtd_range_atomic_radius | std_atomic_radius | wtd_std_atomic_radius | mean_Density | wtd_mean_Density | gmean_Density | wtd_gmean_Density | entropy_Density | wtd_entropy_Density | range_Density | wtd_range_Density | std_Density | wtd_std_Density | mean_ElectronAffinity | wtd_mean_ElectronAffinity | gmean_ElectronAffinity | wtd_gmean_ElectronAffinity | entropy_ElectronAffinity | wtd_entropy_ElectronAffinity | range_ElectronAffinity | wtd_range_ElectronAffinity | std_ElectronAffinity | wtd_std_ElectronAffinity | mean_FusionHeat | wtd_mean_FusionHeat | gmean_FusionHeat | wtd_gmean_FusionHeat | entropy_FusionHeat | wtd_entropy_FusionHeat | range_FusionHeat | wtd_range_FusionHeat | std_FusionHeat | wtd_std_FusionHeat | mean_ThermalConductivity | wtd_mean_ThermalConductivity | gmean_ThermalConductivity | wtd_gmean_ThermalConductivity | entropy_ThermalConductivity | wtd_entropy_ThermalConductivity | range_ThermalConductivity | wtd_range_ThermalConductivity | std_ThermalConductivity | wtd_std_ThermalConductivity | mean_Valence | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | critical_temp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 | 21197.000000 |
| mean | 4.120300 | 87.544201 | 72.945331 | 71.253164 | 58.474874 | 1.166782 | 1.065103 | 115.751945 | 33.204053 | 44.436401 | 41.490888 | 769.763322 | 870.845514 | 737.547500 | 833.083589 | 1.300496 | 0.927436 | 573.413988 | 483.935313 | 216.035193 | 224.476880 | 157.968942 | 134.637733 | 144.407310 | 120.880069 | 1.269041 | 1.132479 | 139.552012 | 51.290519 | 51.669299 | 52.410878 | 6104.949112 | 5259.357685 | 3448.167503 | 3104.482658 | 1.073272 | 0.856543 | 8673.258150 | 2900.587194 | 3418.704041 | 3321.751640 | 76.866669 | 92.749248 | 54.307039 | 72.413486 | 1.071182 | 0.771238 | 120.868857 | 59.383115 | 48.956611 | 44.445791 | 14.259618 | 13.798017 | 10.102742 | 10.097794 | 1.094502 | 0.915214 | 21.109508 | 8.183450 | 8.306107 | 7.698272 | 89.762371 | 81.597376 | 29.765853 | 27.233552 | 0.727696 | 0.539749 | 251.401535 | 62.118953 | 99.127605 | 96.419168 | 3.194431 | 3.148856 | 3.052542 | 3.051485 | 1.297020 | 1.053977 | 2.042553 | 1.480890 | 0.839692 | 0.674168 | 34.494532 |
| std | 1.438282 | 29.643136 | 33.460789 | 30.990774 | 36.615381 | 0.364712 | 0.401225 | 54.584784 | 26.966285 | 20.019732 | 19.973773 | 87.486110 | 143.208996 | 78.345068 | 119.722906 | 0.381597 | 0.334107 | 309.205105 | 224.068070 | 109.820525 | 127.811013 | 20.151776 | 28.777408 | 22.080187 | 35.801133 | 0.375109 | 0.406876 | 67.202224 | 34.993536 | 22.876341 | 25.271459 | 2841.463086 | 3216.978402 | 3697.026215 | 3970.072718 | 0.342333 | 0.319808 | 4093.005329 | 2398.872994 | 1671.437273 | 1609.618632 | 27.695062 | 32.282257 | 29.000902 | 31.666733 | 0.343299 | 0.286020 | 58.679836 | 28.598854 | 21.728055 | 20.417285 | 11.277476 | 14.236516 | 10.049735 | 13.116642 | 0.375772 | 0.369882 | 20.321781 | 11.360991 | 8.645704 | 7.240884 | 38.514224 | 45.554545 | 34.054177 | 40.207155 | 0.326196 | 0.318411 | 158.612185 | 43.135044 | 60.108851 | 63.694958 | 1.043256 | 1.189878 | 1.044637 | 1.173221 | 0.392834 | 0.380121 | 1.243242 | 0.977545 | 0.485002 | 0.456038 | 34.276465 |
| min | 1.000000 | 6.941000 | 6.423452 | 5.320573 | 1.960849 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 375.500000 | 375.500000 | 375.500000 | 375.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 48.000000 | 48.000000 | 48.000000 | 48.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.429000 | 1.429000 | 1.429000 | 0.686245 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.500000 | 1.500000 | 1.500000 | 1.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.222000 | 0.222000 | 0.222000 | 0.222000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.026580 | 0.026580 | 0.026580 | 0.022952 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000210 |
| 25% | 3.000000 | 72.522475 | 52.143839 | 58.041225 | 35.247208 | 0.972995 | 0.781227 | 78.899680 | 16.799962 | 32.890369 | 28.546294 | 723.740000 | 739.184286 | 692.541331 | 720.169563 | 1.086113 | 0.756078 | 268.700000 | 291.268750 | 114.800000 | 94.070916 | 149.333333 | 112.120370 | 133.542493 | 89.204853 | 1.067279 | 0.857598 | 81.000000 | 28.566667 | 35.524639 | 32.270972 | 4513.500000 | 2998.573742 | 883.117278 | 66.735645 | 0.914705 | 0.689062 | 6648.000000 | 1656.489126 | 2832.147163 | 2571.811460 | 62.090000 | 73.395504 | 33.700512 | 50.763699 | 0.894571 | 0.661414 | 87.350000 | 34.074637 | 38.625000 | 33.540079 | 7.588667 | 5.030667 | 4.109878 | 1.321903 | 0.839985 | 0.674190 | 12.878000 | 2.325831 | 4.261340 | 4.603000 | 61.000000 | 54.250000 | 8.339818 | 1.087095 | 0.457810 | 0.250196 | 87.000000 | 29.440000 | 38.002924 | 31.999086 | 2.333333 | 2.116279 | 2.267933 | 2.090896 | 1.060857 | 0.778998 | 1.000000 | 0.921250 | 0.451754 | 0.306892 | 5.380000 |
| 50% | 4.000000 | 84.922750 | 60.689236 | 66.361592 | 39.873869 | 1.199541 | 1.148465 | 122.906070 | 26.601500 | 45.123500 | 44.290157 | 765.200000 | 890.787826 | 728.058342 | 856.752075 | 1.359251 | 0.917130 | 764.100000 | 511.750000 | 266.779102 | 258.776561 | 160.250000 | 125.873918 | 142.807563 | 113.170060 | 1.332739 | 1.244600 | 171.000000 | 42.960000 | 58.663106 | 60.139975 | 5329.085800 | 4285.625500 | 1338.878207 | 1500.233024 | 1.090610 | 0.883398 | 8958.571000 | 2082.956581 | 3303.278324 | 3626.410300 | 73.100000 | 102.960846 | 51.437518 | 73.173958 | 1.138284 | 0.781612 | 127.050000 | 71.159700 | 51.125720 | 48.046579 | 9.304400 | 8.303077 | 5.237518 | 4.897999 | 1.112098 | 0.996004 | 12.878000 | 3.424434 | 4.948155 | 5.498409 | 96.605316 | 73.447753 | 14.287643 | 6.089571 | 0.738694 | 0.545019 | 399.973420 | 56.556240 | 135.782279 | 115.035273 | 2.833333 | 2.605455 | 2.615321 | 2.428047 | 1.368922 | 1.169665 | 2.000000 | 1.062857 | 0.800000 | 0.500000 | 20.000000 |
| 75% | 5.000000 | 100.374260 | 85.982703 | 78.019689 | 73.056021 | 1.444537 | 1.360388 | 154.119320 | 38.336200 | 59.356406 | 53.634945 | 796.320000 | 1004.170714 | 765.715174 | 937.613187 | 1.551446 | 1.061987 | 810.600000 | 690.784000 | 297.946646 | 342.678223 | 169.800000 | 158.206107 | 155.938199 | 150.840017 | 1.512410 | 1.426131 | 205.000000 | 60.000000 | 69.424491 | 73.810265 | 6718.285800 | 6403.916667 | 5770.285523 | 5737.162403 | 1.323930 | 1.081805 | 9778.571000 | 3395.786667 | 4004.273231 | 3959.281462 | 85.325000 | 110.738462 | 67.019848 | 89.995989 | 1.347292 | 0.877792 | 138.630000 | 76.706965 | 56.322245 | 53.318555 | 17.050000 | 18.460000 | 13.573422 | 16.410484 | 1.378110 | 1.157944 | 23.200000 | 10.447600 | 9.038106 | 8.015532 | 111.005316 | 99.075851 | 41.833001 | 47.107534 | 0.962398 | 0.777359 | 399.973420 | 91.876282 | 153.876488 | 162.810294 | 4.000000 | 4.007317 | 3.662842 | 3.909523 | 1.589027 | 1.331531 | 3.000000 | 1.910400 | 1.200000 | 1.021023 | 63.000000 |
| max | 9.000000 | 208.980400 | 208.980400 | 208.980400 | 208.980400 | 1.983797 | 1.958203 | 207.972460 | 205.589910 | 101.019700 | 101.019700 | 1313.100000 | 1348.028986 | 1313.100000 | 1327.593381 | 2.157777 | 2.038560 | 1304.500000 | 1251.855072 | 499.671949 | 479.162305 | 298.000000 | 298.000000 | 298.000000 | 298.000000 | 2.141961 | 1.903748 | 256.000000 | 240.164344 | 115.500000 | 97.140711 | 22590.000000 | 22590.000000 | 22590.000000 | 22590.000000 | 1.954297 | 1.703420 | 22588.571000 | 22434.160000 | 10724.374500 | 10410.932005 | 326.100000 | 326.100000 | 326.100000 | 326.100000 | 1.767732 | 1.675400 | 349.000000 | 218.696600 | 162.895331 | 169.075862 | 105.000000 | 105.000000 | 105.000000 | 105.000000 | 2.034410 | 1.747165 | 104.778000 | 102.675000 | 51.635000 | 51.680482 | 332.500000 | 406.960000 | 317.883627 | 376.032878 | 1.633977 | 1.612989 | 429.974170 | 401.440000 | 214.986150 | 213.300452 | 7.000000 | 7.000000 | 7.000000 | 7.000000 | 2.141963 | 1.949739 | 6.000000 | 6.992200 | 3.000000 | 3.000000 | 185.000000 |
# Standardize all features to z-scores and count, per column, how many
# observations fall outside |z| > 3 (a common univariate outlier rule).
# NOTE: "scalar" is a historical typo for "scaler"; the name is kept in case
# later notebook cells reference it.
scalar = StandardScaler()
x = df.copy(deep=True)
# every column in this dataset is numeric (see dtypes above), so use them all
column_names = list(x.columns)
# raw numeric matrix for the scaler
x = x[column_names].values
# standardized values: zero mean, unit variance per column
x_standard = scalar.fit_transform(x)
standard_df = pd.DataFrame(x_standard, columns=column_names)
# per-feature outlier count and proportion, built as a dict of dicts
outliers = dict()
for col in standard_df.columns:
    # vectorized |z| > 3 count (replaces the former len(np.where(...)[0]))
    count = int((standard_df[col].abs() > 3).sum())
    # NOTE: "propotion" is a typo for "proportion"; the key is kept so any
    # later cell referencing this column name keeps working
    outliers[col] = {"count": count, "propotion": count / standard_df.shape[0]}
outliers = pd.DataFrame(data=outliers).T
# most outlier-heavy features first (notebook cell output)
outliers.sort_values(by="count", ascending=False)
| count | proportion | |
|---|---|---|
| std_FusionHeat | 1115.0 | 0.052602 |
| range_FusionHeat | 1050.0 | 0.049535 |
| wtd_std_FusionHeat | 887.0 | 0.041846 |
| range_ElectronAffinity | 639.0 | 0.030146 |
| wtd_mean_atomic_mass | 537.0 | 0.025334 |
| ... | ... | ... |
| std_atomic_radius | 0.0 | 0.000000 |
| entropy_ThermalConductivity | 0.0 | 0.000000 |
| std_fie | 0.0 | 0.000000 |
| wtd_std_fie | 0.0 | 0.000000 |
| range_ThermalConductivity | 0.0 | 0.000000 |
82 rows × 2 columns
# Feature matrix: every column except the target `critical_temp`.
# df.loc[...] already returns a DataFrame, so no re-wrapping is needed;
# drop(columns=...) states the intent directly.
x_train = df.drop(columns='critical_temp')  # x_train set
x_train
| number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | wtd_std_atomic_mass | mean_fie | wtd_mean_fie | gmean_fie | wtd_gmean_fie | entropy_fie | wtd_entropy_fie | range_fie | wtd_range_fie | std_fie | wtd_std_fie | mean_atomic_radius | wtd_mean_atomic_radius | gmean_atomic_radius | wtd_gmean_atomic_radius | entropy_atomic_radius | wtd_entropy_atomic_radius | range_atomic_radius | wtd_range_atomic_radius | std_atomic_radius | wtd_std_atomic_radius | mean_Density | wtd_mean_Density | gmean_Density | wtd_gmean_Density | entropy_Density | wtd_entropy_Density | range_Density | wtd_range_Density | std_Density | wtd_std_Density | mean_ElectronAffinity | wtd_mean_ElectronAffinity | gmean_ElectronAffinity | wtd_gmean_ElectronAffinity | entropy_ElectronAffinity | wtd_entropy_ElectronAffinity | range_ElectronAffinity | wtd_range_ElectronAffinity | std_ElectronAffinity | wtd_std_ElectronAffinity | mean_FusionHeat | wtd_mean_FusionHeat | gmean_FusionHeat | wtd_gmean_FusionHeat | entropy_FusionHeat | wtd_entropy_FusionHeat | range_FusionHeat | wtd_range_FusionHeat | std_FusionHeat | wtd_std_FusionHeat | mean_ThermalConductivity | wtd_mean_ThermalConductivity | gmean_ThermalConductivity | wtd_gmean_ThermalConductivity | entropy_ThermalConductivity | wtd_entropy_ThermalConductivity | range_ThermalConductivity | wtd_range_ThermalConductivity | std_ThermalConductivity | wtd_std_ThermalConductivity | mean_Valence | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 88.944468 | 57.862692 | 66.361592 | 36.116612 | 1.181795 | 1.062396 | 122.90607 | 31.794921 | 51.968828 | 53.622535 | 775.425000 | 1010.268571 | 718.152900 | 938.016780 | 1.305967 | 0.791488 | 810.6 | 735.985714 | 323.811808 | 355.562967 | 160.250000 | 105.514286 | 136.126003 | 84.528423 | 1.259244 | 1.207040 | 205 | 42.914286 | 75.237540 | 69.235569 | 4654.35725 | 2961.502286 | 724.953211 | 53.543811 | 1.033129 | 0.814598 | 8958.571 | 1579.583429 | 3306.162897 | 3572.596624 | 81.837500 | 111.727143 | 60.123179 | 99.414682 | 1.159687 | 0.787382 | 127.05 | 80.987143 | 51.433712 | 42.558396 | 6.905500 | 3.846857 | 3.479475 | 1.040986 | 1.088575 | 0.994998 | 12.878 | 1.744571 | 4.599064 | 4.666920 | 107.756645 | 61.015189 | 7.062488 | 0.621979 | 0.308148 | 0.262848 | 399.97342 | 57.127669 | 168.854244 | 138.517163 | 2.25 | 2.257143 | 2.213364 | 2.219783 | 1.368922 | 1.066221 | 1 | 1.085714 | 0.433013 | 0.437059 |
| 1 | 5 | 92.729214 | 58.518416 | 73.132787 | 36.396602 | 1.449309 | 1.057755 | 122.90607 | 36.161939 | 47.094633 | 53.979870 | 766.440000 | 1010.612857 | 720.605511 | 938.745413 | 1.544145 | 0.807078 | 810.6 | 743.164286 | 290.183029 | 354.963511 | 161.200000 | 104.971429 | 141.465215 | 84.370167 | 1.508328 | 1.204115 | 205 | 50.571429 | 67.321319 | 68.008817 | 5821.48580 | 3021.016571 | 1237.095080 | 54.095718 | 1.314442 | 0.914802 | 10488.571 | 1667.383429 | 3767.403176 | 3632.649185 | 90.890000 | 112.316429 | 69.833315 | 101.166398 | 1.427997 | 0.838666 | 127.05 | 81.207857 | 49.438167 | 41.667621 | 7.784400 | 3.796857 | 4.403790 | 1.035251 | 1.374977 | 1.073094 | 12.878 | 1.595714 | 4.473363 | 4.603000 | 172.205316 | 61.372331 | 16.064228 | 0.619735 | 0.847404 | 0.567706 | 429.97342 | 51.413383 | 198.554600 | 139.630922 | 2.00 | 2.257143 | 1.888175 | 2.210679 | 1.557113 | 1.047221 | 2 | 1.128571 | 0.632456 | 0.468606 |
| 2 | 4 | 88.944468 | 57.885242 | 66.361592 | 36.122509 | 1.181795 | 0.975980 | 122.90607 | 35.741099 | 51.968828 | 53.656268 | 775.425000 | 1010.820000 | 718.152900 | 939.009036 | 1.305967 | 0.773620 | 810.6 | 743.164286 | 323.811808 | 354.804183 | 160.250000 | 104.685714 | 136.126003 | 84.214573 | 1.259244 | 1.132547 | 205 | 49.314286 | 75.237540 | 67.797712 | 4654.35725 | 2999.159429 | 724.953211 | 53.974022 | 1.033129 | 0.760305 | 8958.571 | 1667.383429 | 3306.162897 | 3592.019281 | 81.837500 | 112.213571 | 60.123179 | 101.082152 | 1.159687 | 0.786007 | 127.05 | 81.207857 | 51.433712 | 41.639878 | 6.905500 | 3.822571 | 3.479475 | 1.037439 | 1.088575 | 0.927479 | 12.878 | 1.757143 | 4.599064 | 4.649635 | 107.756645 | 60.943760 | 7.062488 | 0.619095 | 0.308148 | 0.250477 | 399.97342 | 57.127669 | 168.854244 | 138.540613 | 2.25 | 2.271429 | 2.213364 | 2.232679 | 1.368922 | 1.029175 | 1 | 1.114286 | 0.433013 | 0.444697 |
| 3 | 4 | 88.944468 | 57.873967 | 66.361592 | 36.119560 | 1.181795 | 1.022291 | 122.90607 | 33.768010 | 51.968828 | 53.639405 | 775.425000 | 1010.544286 | 718.152900 | 938.512777 | 1.305967 | 0.783207 | 810.6 | 739.575000 | 323.811808 | 355.183884 | 160.250000 | 105.100000 | 136.126003 | 84.371352 | 1.259244 | 1.173033 | 205 | 46.114286 | 75.237540 | 68.521665 | 4654.35725 | 2980.330857 | 724.953211 | 53.758486 | 1.033129 | 0.788889 | 8958.571 | 1623.483429 | 3306.162897 | 3582.370597 | 81.837500 | 111.970357 | 60.123179 | 100.244950 | 1.159687 | 0.786900 | 127.05 | 81.097500 | 51.433712 | 42.102344 | 6.905500 | 3.834714 | 3.479475 | 1.039211 | 1.088575 | 0.964031 | 12.878 | 1.744571 | 4.599064 | 4.658301 | 107.756645 | 60.979474 | 7.062488 | 0.620535 | 0.308148 | 0.257045 | 399.97342 | 57.127669 | 168.854244 | 138.528893 | 2.25 | 2.264286 | 2.213364 | 2.226222 | 1.368922 | 1.048834 | 1 | 1.100000 | 0.433013 | 0.440952 |
| 4 | 4 | 88.944468 | 57.840143 | 66.361592 | 36.110716 | 1.181795 | 1.129224 | 122.90607 | 27.848743 | 51.968828 | 53.588771 | 775.425000 | 1009.717143 | 718.152900 | 937.025573 | 1.305967 | 0.805230 | 810.6 | 728.807143 | 323.811808 | 356.319281 | 160.250000 | 106.342857 | 136.126003 | 84.843442 | 1.259244 | 1.261194 | 205 | 36.514286 | 75.237540 | 70.634448 | 4654.35725 | 2923.845143 | 724.953211 | 53.117029 | 1.033129 | 0.859811 | 8958.571 | 1491.783429 | 3306.162897 | 3552.668664 | 81.837500 | 111.240714 | 60.123179 | 97.774719 | 1.159687 | 0.787396 | 127.05 | 80.766429 | 51.433712 | 43.452059 | 6.905500 | 3.871143 | 3.479475 | 1.044545 | 1.088575 | 1.044970 | 12.878 | 1.744571 | 4.599064 | 4.684014 | 107.756645 | 61.086617 | 7.062488 | 0.624878 | 0.308148 | 0.272820 | 399.97342 | 57.127669 | 168.854244 | 138.493671 | 2.25 | 2.242857 | 2.213364 | 2.206963 | 1.368922 | 1.096052 | 1 | 1.057143 | 0.433013 | 0.428809 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21258 | 4 | 106.957877 | 53.095769 | 82.515384 | 43.135565 | 1.177145 | 1.254119 | 146.88130 | 15.504479 | 65.764081 | 43.202659 | 661.775000 | 753.793333 | 651.611213 | 750.570867 | 1.371139 | 0.927050 | 273.0 | 427.546667 | 114.383355 | 64.428777 | 176.500000 | 138.244444 | 169.947614 | 134.233861 | 1.350879 | 1.112222 | 111 | 57.808889 | 45.883003 | 35.811330 | 7341.25000 | 4963.928889 | 6404.741690 | 4082.735787 | 1.284617 | 1.110551 | 7511.000 | 2449.715556 | 2981.637585 | 2978.412680 | 63.825000 | 86.511111 | 49.825259 | 61.057784 | 1.164382 | 0.511822 | 117.90 | 74.140000 | 43.467883 | 55.219027 | 25.700000 | 34.448000 | 22.494622 | 29.041737 | 1.245563 | 0.631341 | 36.400 | 27.497778 | 14.446107 | 17.657417 | 65.500000 | 111.537778 | 42.371302 | 95.001493 | 1.029002 | 0.634332 | 134.00000 | 83.048889 | 55.056789 | 46.595943 | 3.25 | 3.555556 | 3.223710 | 3.519911 | 1.377820 | 0.913658 | 1 | 2.168889 | 0.433013 | 0.496904 |
| 21259 | 5 | 92.266740 | 49.021367 | 64.812662 | 32.867748 | 1.323287 | 1.571630 | 188.38390 | 7.353333 | 69.232655 | 50.148287 | 747.780000 | 989.819048 | 702.115184 | 923.426093 | 1.541006 | 0.988470 | 810.6 | 659.771429 | 293.286136 | 345.450969 | 159.200000 | 112.000000 | 139.743372 | 89.633687 | 1.507321 | 1.536691 | 205 | 20.285714 | 67.211309 | 72.795212 | 5174.28580 | 2827.415190 | 962.364248 | 66.286408 | 1.165065 | 1.080432 | 11848.571 | 1705.918143 | 4506.895480 | 3937.105612 | 60.484000 | 100.490952 | 29.177546 | 60.814837 | 1.112639 | 0.693424 | 138.63 | 74.090000 | 58.434336 | 57.634544 | 6.812400 | 4.793429 | 3.839585 | 1.315769 | 1.345508 | 1.203223 | 12.878 | 2.378952 | 4.340205 | 5.204855 | 132.805316 | 108.680590 | 17.747413 | 1.577047 | 0.949904 | 0.745515 | 399.97342 | 76.176553 | 151.164639 | 156.558695 | 2.20 | 2.047619 | 2.168944 | 2.038991 | 1.594167 | 1.337246 | 1 | 0.904762 | 0.400000 | 0.212959 |
| 21260 | 2 | 99.663190 | 95.609104 | 99.433882 | 95.464320 | 0.690847 | 0.530198 | 13.51362 | 53.041104 | 6.756810 | 5.405448 | 733.550000 | 691.580000 | 730.207231 | 689.480961 | 0.688594 | 0.542120 | 139.9 | 370.180000 | 69.950000 | 55.960000 | 183.500000 | 192.200000 | 182.926215 | 191.826893 | 0.690022 | 0.465055 | 29 | 124.600000 | 14.500000 | 11.600000 | 10296.50000 | 9260.600000 | 10150.719679 | 9170.377777 | 0.679023 | 0.572700 | 3453.000 | 4451.400000 | 1726.500000 | 1381.200000 | 71.400000 | 81.120000 | 69.537903 | 79.871364 | 0.667182 | 0.397810 | 32.40 | 59.040000 | 16.200000 | 12.960000 | 21.750000 | 24.780000 | 21.155614 | 24.380978 | 0.665945 | 0.395385 | 10.100 | 18.100000 | 5.050000 | 4.040000 | 62.500000 | 57.400000 | 61.919302 | 57.038314 | 0.683870 | 0.559446 | 17.00000 | 29.000000 | 8.500000 | 6.800000 | 4.50 | 4.800000 | 4.472136 | 4.781762 | 0.686962 | 0.450561 | 1 | 3.200000 | 0.500000 | 0.400000 |
| 21261 | 2 | 99.663190 | 97.095602 | 99.433882 | 96.901083 | 0.690847 | 0.640883 | 13.51362 | 31.115202 | 6.756810 | 6.249958 | 733.550000 | 706.969000 | 730.207231 | 704.143255 | 0.688594 | 0.648876 | 139.9 | 208.799000 | 69.950000 | 64.702805 | 183.500000 | 189.010000 | 182.926215 | 188.514109 | 0.690022 | 0.590271 | 29 | 84.230000 | 14.500000 | 13.412304 | 10296.50000 | 9640.430000 | 10150.719679 | 9518.329826 | 0.679023 | 0.667210 | 3453.000 | 2186.170000 | 1726.500000 | 1596.989169 | 71.400000 | 77.556000 | 69.537903 | 75.915236 | 0.667182 | 0.527718 | 32.40 | 43.332000 | 16.200000 | 14.984781 | 21.750000 | 23.669000 | 21.155614 | 23.144890 | 0.665945 | 0.525289 | 10.100 | 13.315000 | 5.050000 | 4.671182 | 62.500000 | 59.270000 | 61.919302 | 58.781651 | 0.683870 | 0.659671 | 17.00000 | 15.250000 | 8.500000 | 7.862385 | 4.50 | 4.690000 | 4.472136 | 4.665819 | 0.686962 | 0.577601 | 1 | 2.210000 | 0.500000 | 0.462493 |
| 21262 | 3 | 87.468333 | 86.858500 | 82.555758 | 80.458722 | 1.041270 | 0.895229 | 71.75500 | 43.144000 | 29.905282 | 33.927941 | 856.166667 | 821.190000 | 852.858789 | 818.631908 | 1.094784 | 0.968771 | 181.3 | 285.510000 | 74.569624 | 65.291691 | 127.333333 | 137.500000 | 125.493812 | 136.084313 | 1.084025 | 0.883461 | 53 | 67.700000 | 21.853045 | 19.345542 | 6311.00000 | 6914.900000 | 6186.508901 | 6830.731801 | 1.078970 | 0.874139 | 3055.000 | 3455.100000 | 1248.208583 | 1039.909655 | 135.133333 | 104.930000 | 86.530864 | 57.563783 | 0.839248 | 0.747947 | 179.30 | 68.080000 | 83.414480 | 87.740504 | 12.233333 | 14.440000 | 10.925390 | 13.816376 | 1.002899 | 0.826780 | 12.100 | 6.460000 | 5.062498 | 3.481724 | 27.506667 | 40.752000 | 4.976422 | 12.919996 | 0.194158 | 0.142553 | 78.48000 | 39.448000 | 36.425359 | 38.254432 | 5.00 | 4.500000 | 4.762203 | 4.242641 | 1.054920 | 0.970116 | 3 | 1.800000 | 1.414214 | 1.500000 |
21197 rows × 81 columns
# Target as a single-column DataFrame: double-bracket selection returns a
# DataFrame directly, avoiding the Series -> DataFrame round trip.
y_train = df[['critical_temp']]  # y_train set
y_train
| critical_temp | |
|---|---|
| 0 | 29.00 |
| 1 | 26.00 |
| 2 | 19.00 |
| 3 | 22.00 |
| 4 | 23.00 |
| ... | ... |
| 21258 | 2.44 |
| 21259 | 122.10 |
| 21260 | 1.98 |
| 21261 | 1.84 |
| 21262 | 12.80 |
21197 rows × 1 columns
# Hold out 33% of the rows for testing; random_state fixes the shuffle.
# train_test_split preserves the input types, so the splits are already
# DataFrames and need no re-wrapping.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train, y_train, test_size=0.33, random_state=random_state
)
print("Shapes:\n",
      "X_train: ", X_train.shape, "\n",
      "X_test: ", X_test.shape, "\n",
      "Y_train: ", Y_train.shape, "\n",
      "Y_test: ", Y_test.shape, "\n",
      )
Shapes: X_train: (14201, 81) X_test: (6996, 81) Y_train: (14201, 1) Y_test: (6996, 1)
# Data Scaling: fit the scaler on the training split only, then apply the
# same transform to the test split (avoids test-set leakage).
scaler = StandardScaler()
# Use each split's own columns and index: the original passed the pre-split
# x_train.columns here, and dropping index= would reset row labels and break
# positional alignment with Y_train / Y_test.
x_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
x_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)
x_train_scaled
| number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | wtd_std_atomic_mass | mean_fie | wtd_mean_fie | gmean_fie | wtd_gmean_fie | entropy_fie | wtd_entropy_fie | range_fie | wtd_range_fie | std_fie | wtd_std_fie | mean_atomic_radius | wtd_mean_atomic_radius | gmean_atomic_radius | wtd_gmean_atomic_radius | entropy_atomic_radius | wtd_entropy_atomic_radius | range_atomic_radius | wtd_range_atomic_radius | std_atomic_radius | wtd_std_atomic_radius | mean_Density | wtd_mean_Density | gmean_Density | wtd_gmean_Density | entropy_Density | wtd_entropy_Density | range_Density | wtd_range_Density | std_Density | wtd_std_Density | mean_ElectronAffinity | wtd_mean_ElectronAffinity | gmean_ElectronAffinity | wtd_gmean_ElectronAffinity | entropy_ElectronAffinity | wtd_entropy_ElectronAffinity | range_ElectronAffinity | wtd_range_ElectronAffinity | std_ElectronAffinity | wtd_std_ElectronAffinity | mean_FusionHeat | wtd_mean_FusionHeat | gmean_FusionHeat | wtd_gmean_FusionHeat | entropy_FusionHeat | wtd_entropy_FusionHeat | range_FusionHeat | wtd_range_FusionHeat | std_FusionHeat | wtd_std_FusionHeat | mean_ThermalConductivity | wtd_mean_ThermalConductivity | gmean_ThermalConductivity | wtd_gmean_ThermalConductivity | entropy_ThermalConductivity | wtd_entropy_ThermalConductivity | range_ThermalConductivity | wtd_range_ThermalConductivity | std_ThermalConductivity | wtd_std_ThermalConductivity | mean_Valence | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.782197 | 1.355194 | 2.180013 | 1.442480 | 2.280814 | -0.432528 | -0.794273 | 0.229753 | 1.727512 | 0.397807 | -0.403240 | 0.655526 | -0.042104 | 1.114707 | 0.263273 | -0.539547 | -0.552544 | -1.454721 | 0.241337 | -1.438226 | -1.588450 | -0.426225 | 0.258402 | 0.144156 | 0.528668 | -0.490985 | -0.815531 | -1.231549 | 0.677758 | -1.235690 | -1.031048 | 2.273734 | 1.950002 | 1.987333 | 1.616745 | -0.370376 | -0.330099 | 1.851766 | 1.672875 | 2.209485 | 2.626139 | 2.805334 | 2.597498 | 3.350898 | 3.242455 | 0.021738 | -0.296389 | -0.836299 | 2.117532 | -0.902095 | -1.154572 | 0.415185 | 0.449956 | 0.798697 | 0.737603 | -0.096700 | -0.396105 | -0.405162 | 0.246150 | -0.342602 | -0.495874 | 2.452239 | -0.405501 | 0.784876 | -0.370198 | -0.340902 | 0.295008 | 0.918076 | -0.364187 | 1.076985 | -0.072416 | 1.406508 | 2.279004 | 1.056995 | 2.328312 | -0.748984 | -0.963929 | 1.564077 | 2.381904 | 2.145551 | 0.057192 |
| 1 | 0.610304 | 0.057860 | -0.627894 | -0.022547 | -0.636824 | 0.762305 | 0.810130 | 0.160989 | -0.530300 | 0.118091 | 0.184703 | -0.336711 | 0.921017 | -0.577064 | 0.857495 | 0.617919 | -0.063367 | 0.767899 | 0.964575 | 0.753193 | 0.937594 | 1.150132 | -0.683358 | 0.526940 | -0.869147 | 0.613529 | 0.707225 | 0.977086 | -0.462184 | 1.102327 | 1.094862 | -0.489582 | -0.707665 | -0.645007 | -0.764769 | 0.731538 | 0.136613 | 0.061310 | -0.350593 | -0.247333 | 0.202321 | -0.174367 | 0.517644 | -0.045471 | 0.465773 | 0.831838 | 0.043179 | 0.106084 | 0.583908 | 0.054989 | 0.223364 | -0.560185 | -0.608495 | -0.554588 | -0.669810 | 0.762365 | 0.479294 | -0.406257 | -0.468284 | -0.448315 | -0.292162 | -0.005093 | 0.321320 | -0.626997 | -0.650682 | -0.826510 | -1.052709 | 0.936842 | 0.696304 | 0.934583 | 1.102727 | -0.571631 | -0.890798 | -0.538089 | -0.836259 | 0.681403 | 0.580612 | -0.042364 | -0.479201 | -0.089027 | -0.743135 |
| 2 | -1.478448 | 0.102857 | 0.639025 | 0.582157 | 0.944713 | -1.346449 | -1.141501 | -1.546072 | 0.186206 | -1.435362 | -1.319149 | 1.198031 | -0.096001 | 1.709973 | 0.178574 | -1.606182 | -0.754384 | -1.394186 | -1.492810 | -1.318748 | -1.218275 | -0.816166 | 0.477226 | -0.252185 | 0.696353 | -1.592084 | -1.316214 | -1.261396 | 0.337617 | -1.059031 | -1.023575 | 0.970490 | 1.353939 | 1.310231 | 1.503158 | -1.298140 | -1.022768 | -0.586283 | 1.006136 | -0.168166 | -0.177708 | -0.345336 | -0.879239 | 0.405905 | -0.287533 | -1.159797 | -0.292677 | -1.646492 | -1.906840 | -1.695309 | -1.604452 | 0.712311 | 0.501392 | 1.133743 | 0.775186 | -1.155892 | -0.603892 | -0.499700 | -0.725896 | -0.325474 | -0.330597 | -0.760112 | -0.407714 | 0.876556 | 0.865147 | -0.152609 | 0.208927 | -1.454865 | -0.845984 | -1.476690 | -1.357216 | 1.246980 | 1.024083 | 1.355200 | 1.101170 | -1.558370 | -0.982698 | -0.845584 | -0.879381 | -0.706532 | -0.426589 |
| 3 | -0.085947 | 0.994871 | 0.880874 | 0.178165 | -0.220028 | -0.040494 | -0.509000 | 1.193483 | 1.300057 | 1.232974 | 2.165211 | -0.357505 | -0.528676 | -0.104386 | -0.366438 | 0.191103 | 0.471554 | -0.881423 | -0.764344 | -0.946160 | -1.050628 | 0.453216 | 0.309557 | 0.604756 | 0.350035 | 0.174794 | 0.178008 | 0.022001 | 0.198146 | -0.037026 | 0.010410 | 1.894026 | 1.903307 | 1.437010 | 1.047931 | 0.289202 | -0.096362 | 2.771142 | 2.026416 | 2.310275 | 3.607142 | 0.321949 | -0.287000 | 0.550344 | -0.309378 | 0.416294 | 0.531538 | 0.059178 | -0.233109 | 0.001913 | 0.654431 | 1.158001 | 1.488870 | 1.346832 | 1.624599 | 0.423285 | -0.074695 | 0.888511 | 1.155603 | 0.701835 | 0.929060 | -0.137530 | -0.039736 | 0.619755 | 0.620059 | 0.938153 | 0.928369 | -0.710165 | -0.140727 | -0.556934 | -0.527437 | 1.246980 | 0.989130 | 1.135830 | 0.874163 | 0.081161 | 0.139381 | 0.760856 | 0.489119 | 1.351815 | 1.760894 |
| 4 | 0.610304 | -0.449832 | -0.496962 | -0.348060 | -0.623945 | 0.782370 | -0.104449 | 0.124399 | 0.084365 | -0.233496 | 0.525447 | 0.466249 | 0.987577 | 0.355805 | 0.901630 | 0.654406 | -0.385210 | 0.642945 | 1.189374 | 0.631423 | 1.012400 | -0.402995 | -1.050624 | -0.498922 | -1.029246 | 0.664978 | 0.165369 | 0.469697 | -0.033529 | 0.312752 | 0.593274 | -0.398712 | -0.723515 | -0.639113 | -0.768196 | 0.689631 | -0.157669 | 0.061310 | -0.550963 | -0.119138 | 0.144395 | -0.466674 | 0.575393 | -1.039933 | 0.760173 | 0.141047 | 0.002275 | 0.344025 | 0.775774 | 0.409682 | -0.056957 | -0.643083 | -0.703573 | -0.599412 | -0.691740 | 0.744057 | 0.249318 | -0.406257 | -0.564118 | -0.488051 | -0.423831 | 0.617489 | -0.449686 | -0.456650 | -0.662050 | 0.324869 | -0.723837 | 0.936842 | -0.126728 | 0.832084 | 0.654358 | -0.954497 | -0.752523 | -0.846379 | -0.710930 | 0.755916 | 0.043490 | -0.845584 | -0.350638 | -0.912366 | -0.528867 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14196 | -1.478448 | -2.091198 | -1.405825 | -1.473700 | -0.890096 | -1.308487 | -1.008158 | -2.072790 | -0.990327 | -2.153611 | -2.016117 | -1.288085 | -1.606864 | -1.089361 | -1.645938 | -1.616962 | -0.719503 | -1.337860 | -1.920332 | -1.239441 | -1.144996 | -1.313963 | -0.205049 | -0.615774 | 0.201672 | -1.555417 | -1.099764 | -1.679246 | -1.105528 | -1.672889 | -1.555305 | -1.366573 | -0.914751 | -0.346034 | -0.213409 | -1.179536 | -0.768830 | -1.883625 | -0.820178 | -1.757000 | -1.770687 | -1.954035 | -2.033114 | -1.592309 | -1.921991 | -2.716123 | -2.339647 | -1.336060 | -1.168802 | -1.275729 | -1.160035 | -0.401912 | -0.273702 | -0.043013 | -0.016875 | -1.087329 | -0.725110 | -0.947509 | -0.466210 | -0.854606 | -0.938450 | 2.793794 | 2.706535 | 4.824045 | 4.316540 | -0.161787 | 0.248206 | -1.114070 | 0.342511 | -1.027166 | -0.938961 | -0.667347 | -0.464910 | -0.578210 | -0.429188 | -1.593956 | -1.156052 | -0.845584 | -0.494971 | -0.706532 | -0.414039 |
| 14197 | -0.085947 | 1.903616 | 1.372148 | 2.064753 | 1.480745 | 0.435372 | 0.756799 | -0.254412 | -0.788127 | 0.253313 | 0.067979 | 0.013241 | -1.036647 | 0.346485 | -0.962671 | 0.201414 | 1.121049 | -1.194131 | -1.488689 | -1.082748 | -1.097738 | 1.411475 | 1.932397 | 1.896733 | 1.933117 | 0.307522 | 0.256277 | -1.768785 | 0.027805 | -1.900108 | -1.779003 | 3.372158 | 2.251928 | 2.950196 | 2.132921 | 0.668801 | 1.619696 | 1.285060 | -0.717822 | 1.728166 | 1.340837 | 1.917123 | 0.387897 | 2.231033 | 0.799781 | 0.671622 | 2.083594 | 0.214394 | -1.588728 | 0.206587 | 0.048378 | 1.158001 | 1.102667 | 1.640390 | 1.431090 | 0.718784 | 0.708971 | -0.250916 | 0.258035 | -0.299589 | -0.292305 | 0.355348 | 0.401716 | 1.904362 | 1.565527 | 1.758071 | 2.039649 | -0.981539 | -0.438252 | -0.958567 | -0.852944 | 2.443435 | 2.072669 | 2.560108 | 2.168658 | 0.218172 | 0.567258 | -0.845584 | 0.017576 | -0.844415 | -0.426589 |
| 14198 | 0.610304 | 0.266109 | -0.481102 | 0.066117 | -0.591381 | 0.693471 | 1.045651 | 0.748462 | -0.695092 | 0.524779 | 0.562239 | -0.313861 | 0.891498 | -0.539369 | 0.833799 | 0.621797 | 0.049092 | 0.767899 | 0.870994 | 0.735292 | 0.921399 | 0.970925 | -0.666493 | 0.422465 | -0.846406 | 0.623916 | 0.905770 | 0.977086 | -0.689169 | 0.970904 | 1.048230 | -0.623851 | -0.657809 | -0.674376 | -0.763186 | 0.598599 | 0.249896 | 0.061310 | -0.333918 | -0.183294 | 0.262059 | -0.351612 | 0.528553 | -0.535126 | 0.452080 | 0.480154 | 0.131815 | 0.258230 | 0.559363 | 0.274346 | 0.175929 | -0.606537 | -0.617177 | -0.575859 | -0.668568 | 0.778857 | 0.532942 | -0.406257 | -0.463980 | -0.487880 | -0.315716 | 0.223187 | 0.413732 | -0.518312 | -0.645890 | -0.150049 | -0.746049 | 0.936842 | 0.738181 | 0.869623 | 1.103826 | -0.954497 | -0.902438 | -0.846379 | -0.842973 | 0.755916 | 0.613314 | -0.845584 | -0.498991 | -0.912366 | -0.894414 |
| 14199 | -0.085947 | 0.164755 | -0.362519 | -0.104229 | -0.590119 | 0.012663 | -0.174827 | 0.333730 | 0.124290 | 0.544126 | 0.835220 | 0.166624 | 0.982838 | -0.088200 | 0.892621 | 0.026302 | -0.432043 | 0.646183 | 1.141963 | 0.915079 | 1.018390 | -0.193920 | -0.685790 | -0.547400 | -0.912746 | -0.005710 | -0.091954 | 0.753238 | 0.224088 | 0.783566 | 1.191428 | -0.126463 | -0.589818 | -0.690544 | -0.767177 | 0.053108 | -0.112398 | 0.061310 | -0.401759 | -0.005779 | 0.362177 | 0.520332 | 0.634797 | 0.949178 | 0.992737 | 0.620498 | 0.170659 | -0.508810 | 0.737250 | -0.381553 | -0.228306 | -0.657693 | -0.666565 | -0.661538 | -0.685960 | -0.047294 | -0.029471 | -0.406257 | -0.539122 | -0.422124 | -0.372554 | 0.420372 | -0.458102 | -0.690902 | -0.662523 | -1.440428 | -0.932149 | 0.936842 | -0.117210 | 1.175651 | 0.663767 | -0.428057 | -0.710579 | -0.403610 | -0.677101 | 0.112926 | 0.107950 | -0.042364 | -0.436394 | -0.029014 | -0.379847 |
| 14200 | -1.478448 | 0.460167 | 0.993853 | 0.855996 | 1.212113 | -1.398881 | -1.176656 | -1.159008 | 0.487224 | -0.907551 | -0.791195 | 1.572205 | 0.204162 | 2.154177 | 0.553964 | -1.599816 | -0.747818 | -1.606219 | -1.512602 | -1.617288 | -1.463344 | -1.961099 | -0.532680 | -1.179136 | -0.045297 | -1.543243 | -1.157021 | -1.947864 | -0.661705 | -2.067511 | -1.905235 | -0.044777 | 0.233861 | 0.683581 | 0.731609 | -1.113070 | -0.604060 | -1.992831 | -0.602926 | -1.890743 | -1.906671 | 2.116406 | 1.672274 | 2.372663 | 1.964612 | -1.370022 | -0.881358 | -0.147208 | 0.838452 | 0.331127 | 0.515563 | 0.747966 | 0.554957 | 1.184261 | 0.839474 | -1.141669 | -0.604862 | -0.539505 | -0.678272 | -0.372508 | -0.376533 | -1.642104 | -1.314825 | -0.514486 | -0.448631 | -1.561900 | -0.805520 | -1.290778 | -1.017752 | -1.260253 | -1.154844 | 2.204144 | 2.051697 | 2.315947 | 2.146972 | -1.553146 | -1.064936 | -0.845584 | 0.120085 | -0.706532 | -0.414039 |
14201 rows × 81 columns
x_test_scaled
| number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | wtd_std_atomic_mass | mean_fie | wtd_mean_fie | gmean_fie | wtd_gmean_fie | entropy_fie | wtd_entropy_fie | range_fie | wtd_range_fie | std_fie | wtd_std_fie | mean_atomic_radius | wtd_mean_atomic_radius | gmean_atomic_radius | wtd_gmean_atomic_radius | entropy_atomic_radius | wtd_entropy_atomic_radius | range_atomic_radius | wtd_range_atomic_radius | std_atomic_radius | wtd_std_atomic_radius | mean_Density | wtd_mean_Density | gmean_Density | wtd_gmean_Density | entropy_Density | wtd_entropy_Density | range_Density | wtd_range_Density | std_Density | wtd_std_Density | mean_ElectronAffinity | wtd_mean_ElectronAffinity | gmean_ElectronAffinity | wtd_gmean_ElectronAffinity | entropy_ElectronAffinity | wtd_entropy_ElectronAffinity | range_ElectronAffinity | wtd_range_ElectronAffinity | std_ElectronAffinity | wtd_std_ElectronAffinity | mean_FusionHeat | wtd_mean_FusionHeat | gmean_FusionHeat | wtd_gmean_FusionHeat | entropy_FusionHeat | wtd_entropy_FusionHeat | range_FusionHeat | wtd_range_FusionHeat | std_FusionHeat | wtd_std_FusionHeat | mean_ThermalConductivity | wtd_mean_ThermalConductivity | gmean_ThermalConductivity | wtd_gmean_ThermalConductivity | entropy_ThermalConductivity | wtd_entropy_ThermalConductivity | range_ThermalConductivity | wtd_range_ThermalConductivity | std_ThermalConductivity | wtd_std_ThermalConductivity | mean_Valence | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.306555 | 0.334165 | -0.419664 | -0.014535 | -0.626718 | 1.008033 | 1.083041 | 1.405015 | -0.321791 | 1.227995 | 1.181915 | -0.250489 | 0.992170 | -0.320051 | 1.004719 | 1.146015 | 0.322742 | 0.617372 | 0.974192 | 0.417869 | 0.834902 | 0.196021 | -1.042166 | 0.040596 | -0.993816 | 1.174148 | 1.374300 | 0.514467 | -0.830453 | 0.361047 | 0.524424 | -0.258936 | -0.704816 | -0.580864 | -0.766137 | 1.033259 | 1.162524 | 0.260752 | -0.673723 | 0.374641 | 0.440543 | -0.267787 | 0.434649 | -0.645118 | -0.039040 | 1.033708 | 0.268355 | 0.303600 | 0.614585 | 0.199037 | 0.341818 | -0.411390 | -0.619442 | -0.429184 | -0.671899 | 1.321201 | 1.682306 | -0.222157 | -0.586911 | -0.369920 | -0.313067 | 0.524079 | -0.178020 | -0.442188 | -0.654768 | 0.738500 | 0.818370 | 0.936842 | -0.226341 | 0.789459 | 0.650119 | -0.507820 | -0.624658 | -0.536731 | -0.668001 | 1.069458 | 0.967296 | 0.760856 | -0.460712 | 0.539883 | 0.724916 |
| 1 | 0.610304 | 0.140465 | -0.410864 | 0.016523 | -0.603968 | 0.740366 | 0.234766 | 0.333730 | 0.001998 | 0.253318 | 0.751189 | -0.219946 | 0.996772 | -0.403260 | 0.919825 | 0.633091 | -0.274733 | 0.646183 | 1.141963 | 0.683308 | 0.999207 | 0.383524 | -0.709740 | 0.047412 | -0.920298 | 0.646600 | 0.321668 | 0.753238 | 0.042812 | 0.645321 | 1.154730 | -0.216270 | -0.613330 | -0.598906 | -0.767364 | 0.831456 | 0.324563 | 0.061310 | -0.481658 | -0.195611 | 0.330969 | 0.085694 | 0.618124 | 0.451715 | 0.949392 | 1.130012 | 0.244078 | -0.160853 | 0.747095 | -0.244510 | -0.181220 | -0.576230 | -0.661128 | -0.569157 | -0.685294 | 0.723624 | 0.505562 | -0.406257 | -0.559308 | -0.435302 | -0.360814 | -0.041411 | -0.455782 | -0.650204 | -0.662415 | -0.971017 | -0.848555 | 0.936842 | -0.117210 | 0.945724 | 0.663216 | -0.380198 | -0.710579 | -0.336916 | -0.677101 | 0.704719 | 0.402872 | -0.042364 | -0.429805 | -0.195379 | -0.379847 |
| 2 | -0.782197 | 0.252380 | -0.414731 | -0.262988 | -0.753002 | -0.994672 | -0.866623 | 0.945576 | 0.244512 | 1.211634 | 1.480720 | 0.668856 | 1.900437 | 0.042562 | 2.200917 | -0.804877 | -1.227990 | 1.090642 | 1.957835 | 1.439668 | 0.440877 | 0.536182 | -1.472135 | -0.429557 | -1.361974 | -0.909165 | -0.379473 | 1.156165 | -0.493244 | 1.692161 | 0.775233 | 0.286790 | -0.189206 | -0.837170 | -0.776760 | -2.363929 | -2.428749 | 2.564056 | 0.688110 | 3.160510 | 2.990152 | 0.483661 | 0.946174 | 0.949408 | 1.443471 | -0.196763 | -0.898839 | -0.455934 | 1.419230 | -0.453087 | -0.658353 | -0.154941 | -0.365119 | -0.745143 | -0.708312 | -2.232150 | -2.061980 | 0.683414 | 0.011281 | 0.903393 | 0.991955 | -0.357798 | -0.844600 | -0.686343 | -0.670182 | -0.488646 | -0.968302 | -0.514691 | -0.500626 | -0.476913 | -0.384311 | -0.188766 | -0.205442 | -0.731209 | -0.456180 | -1.145811 | -0.772135 | 2.367297 | -0.098634 | 2.710833 | 2.308579 |
| 3 | -0.085947 | -1.208293 | -0.954077 | -0.897135 | -0.735068 | 0.204932 | 0.360330 | -0.812840 | -0.690344 | -0.892239 | -0.662078 | 0.332860 | 0.939266 | 0.156511 | 0.938359 | 0.046307 | -0.104955 | 0.617372 | 0.791414 | 0.816914 | 0.836115 | -0.318369 | -0.922682 | -0.600846 | -0.927253 | 0.014234 | 0.395911 | 0.469697 | -0.668986 | 0.603469 | 0.608187 | -0.992152 | -0.747910 | -0.802060 | -0.764569 | -0.692639 | -0.983812 | 0.061310 | -0.231268 | -0.013955 | 0.266862 | -0.314677 | 0.465204 | -0.990316 | -0.100781 | -0.652230 | -0.370461 | 0.303600 | 0.543982 | 0.672861 | 0.442594 | -0.601090 | -0.601701 | -0.627829 | -0.665300 | 0.036204 | -0.004411 | -0.406257 | -0.431119 | -0.428064 | -0.293115 | 1.788756 | 0.869919 | -0.389017 | -0.636564 | 0.267181 | -0.202885 | 0.936842 | 0.996020 | 0.986334 | 1.194279 | -1.145929 | -0.968232 | -1.007869 | -0.897970 | 0.225633 | 0.212247 | -1.648805 | -0.548924 | -1.735705 | -1.478668 |
| 4 | 2.002805 | 0.743780 | -0.415774 | 0.304921 | -0.612102 | 1.494279 | 1.283314 | 1.405015 | -0.361580 | 1.393472 | 1.124376 | -0.397983 | 0.952036 | -0.447934 | 0.949174 | 1.560259 | 0.387796 | 0.674022 | 0.974541 | 0.297841 | 0.850626 | 0.026533 | -0.991195 | 0.025287 | -0.964609 | 1.624071 | 1.297820 | 0.469697 | -0.638515 | 0.068897 | 0.556933 | -0.075529 | -0.708748 | -0.472704 | -0.765361 | 1.619203 | 1.411190 | 0.640177 | -0.710517 | 0.419463 | 0.432758 | -0.437313 | 0.362343 | -0.640622 | -0.154960 | 1.517544 | 0.231955 | 0.303600 | 0.589501 | 0.078131 | 0.441133 | -0.596334 | -0.633996 | -0.508691 | -0.672198 | 1.786567 | 1.516328 | -0.406257 | -0.574177 | -0.517153 | -0.350558 | 0.246124 | -0.197825 | -0.385199 | -0.653220 | 1.199891 | 0.860676 | 0.936842 | -0.207711 | 0.650248 | 0.642817 | -0.325503 | -0.649540 | -0.370203 | -0.685290 | 1.469934 | 1.097855 | 0.760856 | -0.426768 | 0.579644 | 0.644082 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6991 | -0.085947 | 0.061498 | -0.532575 | -0.150423 | -0.595028 | 0.035731 | 0.655114 | 0.160989 | -0.792751 | 0.394075 | 0.343387 | 0.020668 | 0.835052 | -0.312512 | 0.741530 | 0.005902 | -0.112150 | 0.767899 | 0.733299 | 1.006991 | 0.963271 | 0.764339 | -0.591339 | 0.002933 | -0.813894 | -0.059769 | 0.522653 | 0.977086 | -0.922599 | 1.422283 | 1.155525 | -0.468104 | -0.607611 | -0.731527 | -0.761593 | -0.113909 | -0.098222 | 0.061310 | -0.252999 | -0.036973 | 0.312244 | 0.195255 | 0.574459 | 0.216005 | 0.592603 | 0.265883 | 0.158596 | 0.106084 | 0.451375 | 0.101774 | 0.088750 | -0.637637 | -0.609336 | -0.647168 | -0.665574 | 0.002405 | 0.033938 | -0.406257 | -0.437982 | -0.432872 | -0.304761 | 0.465769 | 0.544809 | -0.666859 | -0.645587 | -1.284537 | -1.170704 | 0.936842 | 0.941431 | 1.159755 | 1.203039 | -0.667347 | -0.844914 | -0.646150 | -0.809008 | 0.087584 | 0.481187 | -0.042364 | -0.750774 | 0.046876 | -0.344453 |
| 6992 | -0.085947 | -0.617192 | -0.094680 | -0.099492 | 0.280370 | 0.555737 | 0.299680 | -1.541017 | -0.215174 | -1.582173 | -1.471937 | -0.192983 | -0.552077 | 0.020663 | -0.469466 | 0.175752 | 0.575817 | -0.570334 | -0.603660 | -0.688036 | -0.603593 | 0.117203 | 0.590803 | 0.529010 | 0.732829 | 0.237862 | 0.213186 | -0.515235 | -0.045333 | -0.613016 | -0.553056 | 0.060296 | 0.225919 | 0.608461 | 0.622740 | 0.674715 | 0.781207 | -0.592607 | -0.192349 | -0.613979 | -0.839665 | -1.255820 | -1.569777 | -0.921945 | -1.410496 | 0.022258 | 0.010758 | -0.816342 | -1.004712 | -0.831094 | -0.586385 | 0.197540 | 0.322240 | 0.479969 | 0.489563 | 0.529323 | 0.205505 | -0.066817 | 0.204667 | -0.130314 | 0.043557 | -0.617437 | -0.487865 | 0.917556 | 0.721509 | 1.786969 | 1.932624 | -1.177180 | -0.880023 | -1.231797 | -1.218332 | 0.289816 | 0.408856 | 0.244098 | 0.322656 | 0.091556 | 0.046922 | 0.760856 | 0.359547 | 0.565597 | 1.120013 |
| 6993 | 1.306555 | 0.221467 | -0.498930 | 0.002771 | -0.604308 | 1.148019 | 0.957241 | 0.642559 | -0.456149 | 0.611064 | 0.525387 | -0.689977 | 0.906421 | -0.893205 | 0.830230 | 1.109055 | -0.053008 | 0.767899 | 0.990568 | 0.589834 | 0.956655 | 1.374140 | -0.657620 | 0.868554 | -0.860705 | 1.148542 | 0.780544 | 0.977086 | -0.374174 | 0.866348 | 1.133108 | -0.406056 | -0.629291 | -0.588157 | -0.764143 | 1.144053 | 0.452432 | 0.087092 | -0.350593 | 0.048607 | 0.360545 | -0.461841 | 0.550801 | -0.587935 | 0.524292 | 1.087995 | 0.149042 | 0.303600 | 0.611085 | 0.093429 | 0.151003 | -0.424761 | -0.574484 | -0.455944 | -0.666608 | 1.202709 | 0.704349 | -0.067911 | -0.461696 | -0.264865 | -0.176636 | 0.463550 | 0.334237 | -0.463585 | -0.650621 | 0.606196 | -0.938408 | 0.936842 | 0.696304 | 0.810073 | 1.104396 | -0.667347 | -0.894024 | -0.623725 | -0.839007 | 1.150822 | 0.615173 | -0.042364 | -0.424003 | -0.163617 | -0.753321 |
| 6994 | -0.782197 | 0.627589 | 0.799582 | 1.026894 | 1.054208 | -0.269704 | 0.035524 | -0.981248 | -0.672325 | -0.950510 | -0.912460 | -0.220022 | -0.492548 | -0.155517 | -0.436228 | -0.616626 | 0.152864 | -0.419807 | -0.910897 | -0.283269 | -0.489657 | 1.033979 | 1.014855 | 1.145933 | 0.996134 | -0.594746 | -0.123059 | -0.007846 | -0.840387 | 0.241325 | -0.059039 | 0.342568 | 0.779571 | 0.751370 | 0.971681 | -0.293036 | 0.054424 | -0.047062 | 0.488270 | 0.104652 | 0.126571 | -0.969338 | -1.106303 | -0.469704 | -0.721490 | -0.393850 | 0.269618 | -0.968488 | -1.067076 | -1.038243 | -1.031875 | 0.290392 | 0.397455 | 0.534420 | 0.580490 | -0.285166 | -0.063798 | -0.066817 | 0.121452 | -0.024370 | -0.024431 | -1.127609 | -0.651915 | 0.300481 | 0.484590 | 0.758254 | 1.069828 | -1.252912 | -0.865070 | -1.288690 | -1.211558 | 0.449344 | 0.709507 | 0.349437 | 0.640080 | -0.667463 | -0.295153 | 0.760856 | 0.120085 | 0.831505 | 0.901914 |
| 6995 | -1.478448 | 2.157957 | 2.334858 | 2.251598 | 2.243125 | -1.494333 | -1.101658 | -0.088798 | 0.819696 | 0.551820 | 0.699613 | -0.799659 | -1.195420 | -0.484080 | -1.115857 | -1.598059 | -0.710573 | -1.757717 | -2.081606 | -1.830596 | -1.640655 | 0.702114 | 1.300661 | 1.214275 | 1.403447 | -1.555970 | -1.101096 | -1.544937 | -0.955666 | -1.475577 | -1.365909 | 1.649909 | 1.708441 | 1.985267 | 1.928031 | -1.113898 | -0.516388 | -1.859789 | -0.982501 | -1.727809 | -1.734386 | -0.790794 | -1.167225 | -0.089656 | -0.648458 | -1.278866 | -0.483363 | -1.433283 | -1.427487 | -1.407136 | -1.278821 | 0.550526 | 0.470175 | 0.299893 | 0.232409 | -1.959915 | -1.503576 | 0.506877 | 0.672723 | 0.863897 | 1.108709 | -0.072678 | 0.116358 | 1.175359 | 1.050983 | -0.691297 | -0.124700 | -0.931051 | -0.235832 | -0.785755 | -0.698833 | 1.725562 | 1.548376 | 1.763212 | 1.569111 | -1.593956 | -1.008824 | -0.042364 | -0.494971 | 0.322642 | 0.694496 |
6996 rows × 81 columns
# log-variance of every feature, shown as a bar chart
# (removed: a dead `c = X_train.columns` assignment that was immediately
# shadowed by the loop variable, and a commented-out alternative)
variances = []
for col in X_train:
    # NOTE(review): this takes the log of the *raw* df column, not the scaled
    # one; columns containing zeros yield -inf (see the RuntimeWarning below) —
    # confirm that is intended before relying on those bars
    var = np.var(np.log(df[col]))
    variances.append({"Features": col, "Variance": var})
var_log = pd.DataFrame(variances)
fig = px.bar(var_log, x='Features', y='Variance', width=1000, height=800)
fig.update_layout(xaxis=dict(tickfont=dict(size=5)))
fig.update_layout(yaxis_title="Log-Variance")
fig.show()
C:\Users\raimu\anaconda3\lib\site-packages\pandas\core\arraylike.py:364: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
# function to test the explain variance score with linear regression, random forest regressor and decision tree regressor
def naif_model_testing(X_train, X_test, y_train, y_test):
    """Fit a Random Forest, a Decision Tree and a Linear Regression on the
    train split and print each model's explained-variance score on the
    test split (labels RFs / DTs / LRs, in that order)."""
    # (label template, fresh estimator) pairs, printed in this order
    candidates = [
        ("RVE RFs: %7.4f", RandomForestRegressor(n_estimators=100, random_state=random_state)),
        ("RVE DTs: %7.4f", DecisionTreeRegressor(max_depth=5, random_state=random_state)),
        ("RVE LRs: %7.4f", LinearRegression()),
    ]
    for template, model in candidates:
        model.fit(X_train, y_train)
        score = explained_variance_score(y_test, model.predict(X_test))
        print(template % score)
# function to select the best features with Select From Model algorithm for the regressors: Random Forest and Decision Tree
def sfm(x_train_scaled, Y_train, x_test_scaled, Y_test, estimator):
    """Run SelectFromModel with *estimator*, print the surviving features
    ranked by importance, then re-score the naive models (RF, DT, LR) on
    the reduced train/test matrices."""
    _, n_features = x_train_scaled.shape
    sel = SelectFromModel(estimator=estimator)
    sel.fit(x_train_scaled, Y_train.values.ravel())
    print("Threshold: ", sel.threshold_)
    # positions of the columns that survived the importance threshold
    kept_positions = np.arange(n_features)[sel.get_support()]
    kept = sel.get_support(indices=True)
    importances = sel.estimator_.feature_importances_
    names = [x_train_scaled.columns[i] for i in kept]
    scores = [importances[i] for i in kept]
    imp_selected = pd.concat(
        [pd.DataFrame(kept_positions), pd.DataFrame(names), pd.DataFrame(scores)],
        axis=1)
    imp_selected.columns = ["Index", "Selected Features", "Importance"]
    ranked = imp_selected.sort_values(by="Importance", ascending=False)
    print("\nBest Selected Features with the respective Importance:\n")
    print(ranked.to_string(index=False), "\n")
    print("\nExplained Variance Score:")
    # score the naive models again on the reduced feature set
    naif_model_testing(sel.transform(x_train_scaled), sel.transform(x_test_scaled),
                       Y_train.values.ravel(), Y_test.values.ravel())
# function to select the best features with Select From Model algorithm for Linear Regression classifier
def sfm_linear(x_train_scaled, Y_train, x_test_scaled, Y_test):
    """SelectFromModel driven by LinearRegression coefficients (threshold
    1.25*mean); prints the surviving features ranked by coefficient, then
    re-scores the naive models on the reduced train/test matrices."""
    _, n_features = x_train_scaled.shape
    sel = SelectFromModel(estimator=LinearRegression(), threshold='1.25*mean')
    sel.fit(x_train_scaled, Y_train.values.ravel())
    print("Threshold: ", sel.threshold_)
    # positions of the columns whose |coefficient| passed the threshold
    kept_positions = np.arange(n_features)[sel.get_support()]
    kept = sel.get_support(indices=True)
    coef = sel.estimator_.coef_
    names = [x_train_scaled.columns[i] for i in kept]
    weights = [coef[i] for i in kept]
    table = pd.concat(
        [pd.DataFrame(kept_positions), pd.DataFrame(names), pd.DataFrame(weights)],
        axis=1)
    table.columns = ["Index", "Selected Features", "Coeficient"]
    # NOTE: sorted by signed value, not |coef| — large negative weights rank last
    ranked = table.sort_values(by="Coeficient", ascending=False)
    print("\nBest Selected Features with the respective Coeficient:\n")
    print(ranked.to_string(index=False), "\n")
    print("\nExplained Variance Score:")
    naif_model_testing(sel.transform(x_train_scaled), sel.transform(x_test_scaled),
                       Y_train.values.ravel(), Y_test.values.ravel())
# function to select the best features with Sequential Features Selector algorithm for Random Forest Regressor, Decision Tree Regressor and Linear Regression
def sfs(x_train_scaled, Y_train, x_test_scaled, Y_test, estimator):
    """Forward Sequential Feature Selection of 9 features with *estimator*.

    Prints the selected feature indices and names, then re-scores the naive
    models (RF, DT, LR) on the reduced train/test matrices.
    """
    N, M = x_train_scaled.shape
    # fixed: the local was previously named `sfs`, shadowing this function itself
    selector = SequentialFeatureSelector(estimator, n_features_to_select=9)
    selector.fit(x_train_scaled, Y_train.values.ravel())
    features = selector.get_support()
    Features_selected = np.arange(M)[features]
    features_index = selector.get_support(indices=True)
    features_name = [x_train_scaled.columns[i] for i in features_index]
    f_index_df = pd.DataFrame(Features_selected)
    fnames_df = pd.DataFrame(features_name)
    sfs_selected = pd.concat([f_index_df, fnames_df], axis=1)
    sfs_selected.columns = ["Index", "Selected Features"]
    print("\nBest Selected Features:\n")
    print(sfs_selected, "\n")
    print("\nExplained Variance Score:")
    nX_train = selector.transform(x_train_scaled)
    nX_test = selector.transform(x_test_scaled)
    naif_model_testing(nX_train, nX_test, Y_train.values.ravel(), Y_test.values.ravel())
# function to perform GridSearchCV
def best_model(x, y, estimator, params, cv):
    """Exhaustive grid search over *params* for *estimator*, scored by
    explained variance with *cv* folds; returns the winning estimator,
    its parameters, its score, and the full cv_results_ table."""
    search = GridSearchCV(
        estimator=estimator,
        param_grid=params,
        scoring='explained_variance',
        cv=cv,
        error_score='raise',  # fail loudly instead of masking fit errors
        n_jobs=-1,
    )
    search.fit(x, y)
    return {
        'Estimator': search.best_estimator_,
        'Best Params': search.best_params_,
        'Best Score': search.best_score_,
        'Detailed Results': search.cv_results_,
    }
# function to convert the dataframe with the final results of GridSearch to a CSV file
def result_to_file(algorithm_name, result_map, file_name):
    """Persist a grid-search summary to CSV and return it.

    Parameters:
        algorithm_name: label written into the 'Algorithm' column.
        result_map: dict as returned by best_model; only the
            'Detailed Results' entry (cv_results_) is read.
        file_name: destination CSV path.

    Returns a DataFrame with columns ['Algorithm', 'params',
    'mean_test_score'] (fixed order), as written to the file.
    """
    detailed = pd.DataFrame(result_map['Detailed Results'])
    result = detailed[['params', 'mean_test_score']].assign(Algorithm=algorithm_name)
    # fix: dropped a redundant pd.DataFrame(...) re-wrap — the column
    # selection below already yields a DataFrame
    result = result[['Algorithm', 'params', 'mean_test_score']]
    result.to_csv(file_name)
    return result
In this step, we will find the best features with the Select From Model and Sequential Feature Selector algorithms, supported by a Pearson correlation analysis as a first step. Then we will perform a PCA in order to reduce the dimensionality of the dataset.
# correlation matrix
# Pearson correlation of every feature pair, rendered as a heatmap
corr = df.corr(method='pearson')
fig = px.imshow(corr, text_auto=True, width=800, height=800)
# shrink tick labels on both axes so all 81 feature names fit
fig.update_xaxes(tickfont=dict(size=5))
fig.update_yaxes(tickfont=dict(size=5))
fig.show()
High degree: If the coefficient value lies between ± 0.50 and ± 1, then it is said to be a strong correlation.
Moderate degree: If the value lies between ± 0.30 and ± 0.49, then it is said to be a medium correlation.
Low degree: When the value lies below ±0.29, then it is said to be a small correlation.
So in the step below, we will select the features that are tightly correlated (correlation >= 0.50) with the target 'critical_temp'.
# correlation of every feature with the target, the target itself excluded
corr_target = corr[['critical_temp']].drop(index="critical_temp")
print("Features that are more correlated with target variable 'critical_temp':")
# keep only strongly correlated features (>= 0.5), strongest first
corr_target.loc[corr_target['critical_temp'] >= 0.5].sort_values(by='critical_temp', ascending=False)
Features that are more correlated with target variable 'critical_temp':
| critical_temp | |
|---|---|
| wtd_std_ThermalConductivity | 0.720960 |
| range_ThermalConductivity | 0.687282 |
| range_atomic_radius | 0.653394 |
| std_ThermalConductivity | 0.653232 |
| wtd_entropy_atomic_mass | 0.626536 |
| wtd_entropy_atomic_radius | 0.602986 |
| number_of_elements | 0.600394 |
| range_fie | 0.600179 |
| wtd_std_atomic_radius | 0.598972 |
| entropy_Valence | 0.598024 |
| wtd_entropy_Valence | 0.589204 |
| wtd_std_fie | 0.581442 |
| entropy_fie | 0.567156 |
| wtd_entropy_FusionHeat | 0.562808 |
| std_atomic_radius | 0.559132 |
| entropy_atomic_radius | 0.558249 |
| entropy_FusionHeat | 0.552069 |
| entropy_atomic_mass | 0.542894 |
| std_fie | 0.541091 |
We will select the best features with Select From Model algorithm with Random Forest Regressor, Decision Tree Regressor and Linear Regression. Additionally, we evaluate the explained variance with RF, DT and LR in each step.
sfm(x_train_scaled, Y_train, x_test_scaled, Y_test, RandomForestRegressor(random_state=random_state))
Threshold: 0.012345679012345678
Best Selected Features with the respective Importance:
Index Selected Features Importance
67 range_ThermalConductivity 0.535776
64 wtd_gmean_ThermalConductivity 0.122333
9 std_atomic_mass 0.029559
74 wtd_gmean_Valence 0.022253
66 wtd_entropy_ThermalConductivity 0.012598
72 wtd_mean_Valence 0.012446
Explained Variance Score:
RVE RFs: 0.9054
RVE DTs: 0.7628
RVE LRs: 0.5367
sfm(x_train_scaled, Y_train, x_test_scaled, Y_test, DecisionTreeRegressor(random_state=random_state))
Threshold: 0.012345679012345678
Best Selected Features with the respective Importance:
Index Selected Features Importance
67 range_ThermalConductivity 0.537984
64 wtd_gmean_ThermalConductivity 0.122310
74 wtd_gmean_Valence 0.039756
9 std_atomic_mass 0.032484
78 wtd_range_Valence 0.014258
47 range_ElectronAffinity 0.014253
10 wtd_std_atomic_mass 0.013537
31 mean_Density 0.013443
62 wtd_mean_ThermalConductivity 0.012556
Explained Variance Score:
RVE RFs: 0.9104
RVE DTs: 0.7687
RVE LRs: 0.6062
sfm_linear(x_train_scaled, Y_train, x_test_scaled, Y_test)
Threshold: 18.08131379095711
Best Selected Features with the respective Coeficient:
Index Selected Features Coeficient
22 wtd_mean_atomic_radius 86.176307
72 wtd_mean_Valence 32.069839
75 entropy_Valence 31.440442
49 std_ElectronAffinity 28.530402
1 mean_atomic_mass 27.228482
14 wtd_gmean_fie 26.815100
4 wtd_gmean_atomic_mass 25.051517
62 wtd_mean_ThermalConductivity 23.832226
25 entropy_atomic_radius 22.967635
73 gmean_Valence 20.717748
54 wtd_gmean_FusionHeat 19.613724
17 range_fie 19.120252
26 wtd_entropy_atomic_radius 18.956435
44 wtd_gmean_ElectronAffinity -19.491360
19 std_fie -20.478910
47 range_ElectronAffinity -23.460544
76 wtd_entropy_Valence -23.898121
52 wtd_mean_FusionHeat -26.328234
12 wtd_mean_fie -28.595962
2 wtd_mean_atomic_mass -32.072898
74 wtd_gmean_Valence -36.704486
15 entropy_fie -43.261368
24 wtd_gmean_atomic_radius -92.256168
Explained Variance Score:
RVE RFs: 0.9176
RVE DTs: 0.7525
RVE LRs: 0.6497
We will select the best features with Sequential Feature Selector algorithm with Random Forest Regressor, Decision Tree Regressor and Linear Regression. Additionally, we evaluate the explained variance with RF, DT and LR in each step.
#sfs(x_train_scaled, Y_train, x_test_scaled, Y_test, RandomForestRegressor(random_state= random_state))
# fix: use a context manager so the file handle is closed (it leaked before)
with open("appendix/sfs_RandomForest.txt", "r") as sfsRF_out:
    print(sfsRF_out.read())
Best Selected Features: Index Selected Features 0 7 range_atomic_mass 1 10 wtd_std_atomic_mass 2 15 entropy_fie 3 20 wtd_std_fie 4 27 range_atomic_radius 5 38 wtd_range_Density 6 44 wtd_gmean_ElectronAffinity 7 61 mean_ThermalConductivity 8 72 wtd_mean_Valence Explained Variance Score: RVE RFs: 0.9167 RVE DTs: 0.7385 RVE LRs: 0.5632
#sfs(x_train_scaled, Y_train, x_test_scaled, Y_test, DecisionTreeRegressor(random_state= random_state))
# fix: use a context manager so the file handle is closed (it leaked before)
with open("appendix/sfs_DecisionTree.txt", "r") as sfsDT_out:
    print(sfsDT_out.read())
Best Selected Features: Index Selected Features 0 3 gmean_atomic_mass 1 25 entropy_atomic_radius 2 30 wtd_std_atomic_radius 3 37 range_Density 4 44 wtd_gmean_ElectronAffinity 5 47 range_ElectronAffinity 6 49 std_ElectronAffinity 7 51 mean_FusionHeat 8 68 wtd_range_ThermalConductivity Explained Variance Score: RVE RFs: 0.9109 RVE DTs: 0.7376 RVE LRs: 0.5445
#sfs(x_train_scaled, Y_train, x_test_scaled, Y_test, LinearRegression())
# fix: use a context manager so the file handle is closed (it leaked before)
with open("appendix/sfs_LinearRegression.txt", "r") as sfsLR_out:
    print(sfsLR_out.read())
Best Selected Features: Index Selected Features 0 27 range_atomic_radius 1 42 wtd_mean_ElectronAffinity 2 44 wtd_gmean_ElectronAffinity 3 50 wtd_std_ElectronAffinity 4 60 wtd_std_FusionHeat 5 66 wtd_entropy_ThermalConductivity 6 69 std_ThermalConductivity 7 70 wtd_std_ThermalConductivity 8 80 wtd_std_Valence Explained Variance Score: RVE RFs: 0.9115 RVE DTs: 0.7683 RVE LRs: 0.6558
# best selected features by indice
# consensus of the feature-selection runs above, by column position
selected_index = [15, 25, 44, 47, 49, 64, 67, 72, 74]
# column names of the best selected features
selected_names = x_test_scaled.columns[selected_index].tolist()
print("Best selected features by indice:", selected_index)
print("\nColumn names of the best selected features:\n", selected_names)
Best selected features by indice: [15, 25, 44, 47, 49, 64, 67, 72, 74] Column names of the best selected features: ['entropy_fie', 'entropy_atomic_radius', 'wtd_gmean_ElectronAffinity', 'range_ElectronAffinity', 'std_ElectronAffinity', 'wtd_gmean_ThermalConductivity', 'range_ThermalConductivity', 'wtd_mean_Valence', 'wtd_gmean_Valence']
In this step, we start by evaluating the explained total variance with 10 principal components:
# evaluating the explained total variance with 10 principal components
pca = PCA(n_components=10, random_state=random_state)
pca.fit(x_train_scaled)
ratios = pca.explained_variance_ratio_
# per-component ratio alongside its running total (cumsum matches the
# original left-to-right accumulation exactly)
for i, (ve, tve) in enumerate(zip(ratios, np.cumsum(ratios))):
    print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i, ve, tve))
print()
print("Actual Eigenvalues:", pca.singular_values_, "\n")
for i, comp in enumerate(pca.components_):
    print("PC", i, "-->", comp, "\n")
PC0 - Variance explained: 0.3882 - Total Variance: 0.3882 PC1 - Variance explained: 0.1037 - Total Variance: 0.4919 PC2 - Variance explained: 0.0962 - Total Variance: 0.5881 PC3 - Variance explained: 0.0791 - Total Variance: 0.6672 PC4 - Variance explained: 0.0588 - Total Variance: 0.7260 PC5 - Variance explained: 0.0378 - Total Variance: 0.7637 PC6 - Variance explained: 0.0367 - Total Variance: 0.8004 PC7 - Variance explained: 0.0310 - Total Variance: 0.8315 PC8 - Variance explained: 0.0237 - Total Variance: 0.8552 PC9 - Variance explained: 0.0199 - Total Variance: 0.8751 Actual Eigenvalues: [668.26743995 345.36081648 332.64473513 301.55338751 260.02883809 208.44864431 205.42801886 188.97348227 165.1789302 151.29300197] PC 0 --> [-0.15571443 0.05202323 0.09987778 0.08345308 0.1205011 -0.14650234 -0.15640088 -0.12430574 0.08844361 -0.09850257 -0.11254083 -0.05576073 -0.1285755 -0.02210194 -0.11690234 -0.15651251 -0.10941026 -0.16405123 -0.08526872 -0.15540541 -0.16350122 0.02009301 0.11968099 0.07790363 0.1379374 -0.15329753 -0.16349529 -0.16373978 0.10061436 -0.15000212 -0.16057137 0.10924692 0.13144356 0.150542 0.15579175 -0.1215454 -0.11579583 -0.07467742 0.09452822 -0.04020672 -0.07021667 0.00738102 -0.05970996 0.05641308 -0.01208703 -0.14016098 -0.09566763 -0.10431861 -0.0686458 -0.09193077 -0.10271663 0.09452953 0.09795477 0.11525258 0.11533882 -0.13409699 -0.1432594 0.00301621 0.07725744 0.02049498 0.01044684 -0.05087482 -0.02764379 0.11012691 0.10800042 -0.0338688 0.00097751 -0.14292088 -0.05922646 -0.13392186 -0.1384527 0.13988001 0.14496576 0.14124259 0.14699077 -0.15872666 -0.15610325 -0.02557793 0.10859006 -0.00801483 0.00431711] PC 1 --> [-0.10007715 -0.22568311 -0.20193765 -0.21389395 -0.18446934 -0.12885116 -0.08866039 -0.08872597 -0.1206912 -0.07648633 -0.059769 0.16800255 0.12702963 0.16424139 0.1420496 -0.09967306 -0.09912997 0.02705894 0.05187988 0.06100689 0.037385 -0.21440163 -0.18222633 -0.22417727 -0.15696562 -0.11415286 -0.0757934 0.02087917 
-0.06621921 0.07194807 0.04761352 -0.16802647 -0.14993013 -0.11522841 -0.11098527 -0.1461729 -0.13319516 -0.09627511 -0.10152626 -0.08054333 -0.0675886 0.09025652 0.08281791 0.07175037 0.0869295 -0.10206557 -0.09316122 0.03910292 0.06216073 0.05278969 0.02602499 0.12406961 0.13050566 0.05704523 0.09253688 -0.17324491 -0.13510352 0.16044891 0.13849386 0.18002205 0.1725903 -0.01183736 -0.05886364 0.01147741 0.00075423 -0.11703762 -0.07450584 -0.05390645 -0.05749378 -0.03899042 -0.05805233 -0.0368641 -0.01465001 -0.04208152 -0.01711788 -0.09841921 -0.08972819 -0.00544381 -0.0195664 0.00507352 0.00118976] PC 2 --> [-0.06838743 0.02530786 -0.02606592 0.05655384 -0.00249176 -0.04953216 -0.04147472 -0.09543285 -0.03549468 -0.10658653 -0.09150673 -0.11903039 0.05979289 -0.13920146 0.04887637 -0.08879438 -0.19952029 -0.00564617 0.14402891 0.0033996 0.05052261 0.12888984 -0.03955558 0.09567676 -0.05210017 -0.08530773 -0.06875933 0.01193231 -0.00853514 0.00580134 0.02731738 -0.04994861 -0.07679138 -0.0385445 -0.05089352 -0.07240113 -0.13340495 -0.13163674 -0.04894742 -0.14514685 -0.11782161 -0.11813176 -0.00583056 -0.0762996 0.02744861 -0.08429804 -0.18590506 -0.14277375 0.05034774 -0.13012262 -0.12391992 -0.15843579 -0.15452161 -0.11821946 -0.1298288 -0.03992937 -0.06543496 -0.20759388 -0.11421038 -0.19814035 -0.20498357 0.07164688 0.02432294 -0.05457249 -0.06797859 -0.20016543 -0.24322536 0.1099358 0.08182127 0.12550524 0.11632539 -0.11339568 -0.12067322 -0.08838736 -0.10282035 -0.064449 -0.09667551 -0.22710219 -0.04418331 -0.21173312 -0.23224443] PC 3 --> [-0.05234697 0.18520529 0.15294259 0.15102569 0.11290981 -0.05640258 -0.04404021 0.07584158 0.1185149 0.10354341 0.11942657 0.15444658 0.13833225 0.14925564 0.14762264 -0.05154492 -0.06871556 0.04955978 0.08029329 0.07080011 0.06936597 -0.02720696 -0.05669926 -0.04249869 -0.05902621 -0.05537858 -0.02922239 0.00156612 -0.01511607 0.01627145 0.02888436 0.14884523 0.11133038 0.06283543 0.05482899 -0.10896707 -0.07721367 
0.1484785 0.10580346 0.17307816 0.17708532 0.26961394 0.27291024 0.23170639 0.26508963 -0.01921644 -0.03375821 0.12068656 0.19778374 0.14162756 0.12323819 -0.0884086 -0.09526313 -0.07148352 -0.08834981 -0.06714433 -0.03869833 -0.11282648 -0.08232582 -0.1030554 -0.09478307 -0.02944106 -0.06566957 -0.09822199 -0.07935789 -0.18170536 -0.15394783 0.01701873 -0.00247522 0.03393499 0.01199676 0.10914606 0.10781428 0.10148956 0.10573997 -0.04891729 -0.04082847 0.02810765 0.06404954 0.04332808 0.02785903] PC 4 --> [ 0.01153354 0.0286572 -0.03341355 -0.00792691 -0.04706446 0.00155438 0.00281518 0.08438901 0.0400745 0.10375905 0.06502987 -0.09860236 0.03481754 -0.09906735 0.04297635 0.02011398 -0.07807857 -0.0214873 0.14074166 -0.01924294 -0.00668973 0.12288563 -0.020871 0.08824059 -0.02858658 0.01968752 -0.00411033 0.05889615 0.05285952 0.06248234 0.04749301 0.09426437 0.06703653 0.05312411 0.04607159 0.04440118 -0.0267611 0.13741264 0.1372824 0.14172099 0.14029016 -0.00189774 0.10921561 0.00901535 0.12321258 0.01951418 -0.08793272 -0.06441515 0.15272889 -0.04102058 -0.05019216 0.19171843 0.19347346 0.15912986 0.18104983 0.04277547 -0.00633029 0.15790531 0.19635852 0.16491135 0.14933063 0.29118832 0.29858998 0.19776006 0.2155436 0.03514672 -0.07437284 0.14387047 0.26512451 0.15556816 0.14807464 -0.02109287 -0.02880891 -0.00089416 -0.0108651 0.03389515 -0.01721428 -0.14030199 0.06194528 -0.12372821 -0.16572588] PC 5 --> [ 0.05984263 0.05359979 -0.06812848 0.08872224 -0.03750603 0.12878478 0.07801557 -0.06105453 -0.02634693 -0.08513599 -0.08430604 -0.20329896 0.11952999 -0.20782112 0.13264291 0.08225211 -0.15262662 -0.04546081 0.28331027 -0.05865748 0.01105114 0.16245507 -0.14085904 0.1615566 -0.11748153 0.09267866 0.0538085 0.00545232 0.01438212 -0.00041682 0.02742979 0.00358279 -0.07392536 0.06181742 0.00496169 0.17993007 0.07988808 -0.1024617 -0.04520671 -0.13155811 -0.11030983 -0.01576622 0.15675982 0.03719305 0.19569687 0.1288531 -0.10433029 -0.05306374 0.24896833 
-0.06449003 -0.04876823 0.03179855 0.05466549 0.02274109 0.05129726 0.10040539 0.02451817 0.05879587 0.11948542 0.05455084 0.05100271 -0.14629919 -0.27892861 -0.05718436 -0.12955221 0.15556646 0.07298191 -0.05976092 -0.2094564 -0.08219321 -0.10812697 0.08223003 0.09462144 0.07748877 0.09769753 0.07485915 -0.0502952 0.07513896 0.21778247 0.07184818 0.01819969] PC 6 --> [-0.05647019 0.06788129 -0.02099803 -0.03121123 -0.09035351 -0.12668067 -0.06193322 0.20680227 0.04943502 0.25885852 0.2411621 -0.04261144 0.08223076 -0.0582448 0.08149477 -0.05755978 -0.11545303 0.00782242 0.10952166 0.02802525 0.05169982 0.1030036 -0.06126197 0.06729284 -0.06921997 -0.06261488 -0.01372363 0.04511546 -0.04622535 0.06997993 0.0737028 0.05850626 -0.01458562 -0.03857373 -0.05985665 -0.14195272 -0.08621206 0.25374967 0.04313465 0.28449676 0.25355476 -0.27065586 -0.19459403 -0.21508942 -0.17035547 -0.07244016 -0.11256299 -0.16128121 -0.07313936 -0.16394299 -0.138924 0.0206198 0.01163678 0.01138033 0.00450602 -0.07606763 -0.04599592 0.01353014 0.02090399 0.02134712 0.02862534 -0.13067795 -0.19871789 -0.05272286 -0.07404246 -0.00634218 0.03980436 -0.0783434 -0.18041549 -0.08251652 -0.10479022 -0.04128462 -0.05400736 -0.05366641 -0.06212207 -0.06512901 -0.05813585 0.06743291 -0.01790886 0.08490599 0.07194373] PC 7 --> [ 2.43445348e-02 -1.32946979e-01 -3.61378326e-03 -1.19263816e-01 9.06083756e-03 4.66308332e-02 -1.12819812e-01 -2.63322399e-02 2.85433762e-01 -2.94282981e-02 -6.94920270e-02 -1.40713397e-02 -4.73760974e-02 -4.30406386e-02 -5.42618419e-02 5.19393153e-02 -3.92456098e-02 6.24151774e-02 1.54220968e-01 8.03968704e-02 -9.44999888e-05 -3.97700339e-02 5.64254232e-02 -4.52062713e-02 5.54131314e-02 5.23243505e-02 -8.71391291e-02 1.95011067e-02 3.29100018e-01 3.65556035e-02 -2.55411663e-02 -1.24463570e-01 -1.18669755e-03 -1.02992029e-01 -1.17456733e-02 -4.62606860e-03 -1.80671752e-01 -9.03117231e-03 2.79036461e-01 -4.75238688e-03 -6.85629603e-02 -7.03895984e-02 -4.61301570e-02 
-1.24706510e-01 -6.89698952e-02 1.42619247e-02 -8.04765897e-02 1.08131454e-01 1.56986072e-01 1.08410733e-01 3.28293433e-02 -1.50585152e-01 -7.02253925e-02 -1.83477726e-01 -7.48617409e-02 -2.03485309e-03 -1.22832079e-01 -2.34992422e-02 1.03060621e-01 -1.57901767e-02 -6.84359870e-02 4.27351000e-02 8.68207832e-02 2.00681202e-02 8.21523777e-02 5.95189338e-02 -3.53800555e-02 8.89874476e-03 1.51804481e-01 1.42850753e-02 -1.19278177e-02 -1.06659333e-01 -2.19435117e-02 -1.44930485e-01 -4.38140848e-02 2.31822251e-02 -1.04045229e-01 2.52338486e-01 3.15267154e-01 2.69492297e-01 1.96575521e-01] PC 8 --> [-0.05754298 -0.06469633 -0.12481454 -0.04700495 -0.10056365 -0.05116762 0.04489517 -0.02678454 -0.24379782 -0.02525947 -0.01720463 -0.18221097 -0.01972601 -0.16853528 -0.00854283 -0.06657683 -0.05074627 -0.11191095 -0.04566042 -0.11564667 -0.06609704 0.0380265 -0.07325054 0.08598266 -0.03207774 -0.05095323 0.04010354 -0.09617416 -0.25560137 -0.1034547 -0.07674733 -0.01882791 -0.0850737 -0.00209463 -0.03960152 -0.07388185 0.04563768 0.08667524 -0.19545384 0.0963873 0.07971411 0.07055744 0.11245138 0.07246184 0.08646805 -0.06099122 -0.08536765 -0.04497264 0.06957453 -0.03117536 0.04093008 -0.07088645 -0.06828463 -0.01350569 -0.03311193 -0.01602122 0.04082056 -0.13907726 -0.10851897 -0.1299767 -0.1149111 0.24032716 0.14357995 0.20855297 0.17880969 0.00248776 0.08506109 0.03999229 0.03374312 0.05145171 0.0209148 0.02874876 0.00814253 -0.01759639 -0.02200105 -0.10754248 -0.03261032 0.2997764 -0.08599204 0.33384645 0.30637474] PC 9 --> [-0.07028766 0.09096106 0.06942911 0.05122535 0.04723735 -0.06851778 0.00723323 0.04222472 -0.07584268 0.07779925 0.07697211 -0.20746101 -0.17474243 -0.26558065 -0.20185088 -0.06449037 0.08407146 0.05673614 -0.16907842 0.09245452 0.00348355 0.2485373 0.16230696 0.14983295 0.12051236 -0.07342501 0.00619989 0.15455956 -0.08964042 0.22780302 0.10966065 -0.13484475 -0.04019354 -0.08594494 -0.04477976 -0.04352347 -0.02269355 -0.07606402 -0.07302526 
-0.08040577 -0.01470477 0.07562057 0.07429282 0.11975716 0.08594841 0.02120252 0.06548466 0.00963733 0.03682496 0.02684395 0.08873318 -0.02532711 0.08419298 -0.10424798 0.04333703 -0.11373881 -0.08444405 0.13406349 0.15123959 0.15360027 0.15099229 -0.24951174 0.05365472 -0.22618574 -0.06373507 -0.19431757 -0.19044454 -0.04646457 0.15410843 -0.03186068 0.03204758 -0.04405679 -0.00061464 -0.05853376 -0.01207825 -0.06364751 0.01539113 0.07536031 -0.07796779 0.09711798 0.10857491]
In order to know how many principal components we will reduce our data into, we performed Scree plot and a Explained Variance by Components plot:
# Scree plot: eigenvalue magnitude per principal component.
# generalized: derive the x-axis from the fitted PCA instead of a
# hard-coded [1..10] list, so it stays correct if n_components changes
pc = list(range(1, len(pca.singular_values_) + 1))
plt.plot(pc, pca.singular_values_)
plt.title("Scree Plot", fontsize=14)
plt.xlabel('Principal component', fontsize=12)
plt.ylabel('Eigenvalues', fontsize=12)
Text(0, 0.5, 'Eigenvalues')
Analysing the Scree plot, the slope is steeper until the second component, so we must choose two principal components.
# cumulative explained variance, one marker per added component
plt.figure(figsize=(8, 6))
cumulative = pca.explained_variance_ratio_.cumsum()
plt.plot(cumulative, marker='o', linestyle='--')
plt.title('Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
Text(0, 0.5, 'Cumulative Explained Variance')
However, the selected principal components must explain a cumulative percentage above 70%. With the Explained Variance by Components plot, we can infer that 4 components explain 72.6% of the features' variance. Only 58.8% was explained with 2 principal components.
Given the results we select 4 principal analysis components.
Next, we train the PCA model with 4 principal components and apply a transformation into the x_train_scaled and x_test_scaled:
# project the scaled train/test sets onto the first 4 principal components
# and re-score the naive models on the reduced data
pca = PCA(n_components=4, random_state=random_state)
nX_train = pca.fit(x_train_scaled).transform(x_train_scaled)
nX_test = pca.transform(x_test_scaled)
naif_model_testing(nX_train, nX_test, Y_train.values.ravel(), Y_test.values.ravel())
RVE RFs: 0.8651 RVE DTs: 0.6630 RVE LRs: 0.5048
# wrap the projected training data in a labelled DataFrame;
# generalized: PC1..PCk names derived from the array width instead of
# a hard-coded four-name list
xtrain_pc = pd.DataFrame(nX_train)
xtrain_pc.columns = ["PC%d" % (i + 1) for i in range(nX_train.shape[1])]
xtrain_pc
| PC1 | PC2 | PC3 | PC4 | |
|---|---|---|---|---|
| 0 | 5.414926 | -2.041308 | -2.633267 | 7.741479 |
| 1 | -4.679600 | -0.326168 | 2.389260 | 0.412023 |
| 2 | 7.903280 | 0.968208 | 0.413308 | 0.031600 |
| 3 | 2.493496 | -2.655079 | -6.081691 | 2.035512 |
| 4 | -4.165676 | 0.817865 | 2.233788 | 0.452753 |
| ... | ... | ... | ... | ... |
| 14196 | 5.951361 | 2.782436 | 4.341706 | -6.968918 |
| 14197 | 5.586579 | -6.029637 | -5.473172 | 1.703294 |
| 14198 | -5.030727 | -0.618209 | 2.444568 | 0.144282 |
| 14199 | -3.227114 | 1.178488 | 2.349165 | 2.204764 |
| 14200 | 7.206284 | 3.795183 | 0.161777 | 4.266506 |
14201 rows × 4 columns
# wrap the projected test data in a labelled DataFrame;
# generalized: PC1..PCk names derived from the array width instead of
# a hard-coded four-name list
xtest_pc = pd.DataFrame(nX_test)
xtest_pc.columns = ["PC%d" % (i + 1) for i in range(nX_test.shape[1])]
xtest_pc
| PC1 | PC2 | PC3 | PC4 | |
|---|---|---|---|---|
| 0 | -5.857828 | -1.562840 | -0.668722 | -0.007666 |
| 1 | -4.177553 | -0.130448 | 1.924870 | 1.226300 |
| 2 | -1.292051 | 4.083983 | -0.614974 | 5.548708 |
| 3 | -3.686521 | 2.174282 | 3.576733 | -0.494505 |
| 4 | -6.298011 | -2.586264 | -1.067430 | -0.212216 |
| ... | ... | ... | ... | ... |
| 6991 | -4.067375 | 0.814397 | 2.607586 | 1.243962 |
| 6992 | 2.636201 | -1.326014 | -1.879953 | -4.138144 |
| 6993 | -5.405420 | -1.481888 | 1.594204 | -0.267734 |
| 6994 | 3.916529 | -1.670146 | -1.234818 | -1.505546 |
| 6995 | 9.412228 | -1.951766 | 0.041344 | -0.458290 |
6996 rows × 4 columns
After selecting the best features and reducing the dimensionality of the train and test sets, we will find the best 10 models using the GridSearchCV algorithm (5-fold CV), with the feature-selection set and the set of principal components evaluated individually, in order to conclude which method obtains the best results. The grid search will be performed with the following estimators:
For the top 1 model obtained with the set of best features and principal components, we will train the model.
We start by performing a first run that saves the results to a CSV file; we then read each CSV back into a variable. All the results are merged into a final results dataframe and sorted by mean_test_score (R2 - coefficient of determination).
# finding the best model using feature selection method
# restrict the scaled training set to the columns chosen by the feature-selection step
X = x_train_scaled[selected_names]
# flatten the target frame to a 1-D array, the shape sklearn estimators expect
Y = Y_train.values.ravel()
X
| entropy_fie | entropy_atomic_radius | wtd_gmean_ElectronAffinity | range_ElectronAffinity | std_ElectronAffinity | wtd_gmean_ThermalConductivity | range_ThermalConductivity | wtd_mean_Valence | wtd_gmean_Valence | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.539547 | -0.490985 | 3.242455 | -0.836299 | -0.902095 | -0.370198 | 0.918076 | 2.279004 | 2.328312 |
| 1 | 0.617919 | 0.613529 | 0.465773 | 0.106084 | 0.054989 | -0.650682 | 0.936842 | -0.890798 | -0.836259 |
| 2 | -1.606182 | -1.592084 | -0.287533 | -1.646492 | -1.695309 | 0.865147 | -1.454865 | 1.024083 | 1.101170 |
| 3 | 0.191103 | 0.174794 | -0.309378 | 0.059178 | 0.001913 | 0.620059 | -0.710165 | 0.989130 | 0.874163 |
| 4 | 0.654406 | 0.664978 | 0.760173 | 0.344025 | 0.409682 | -0.662050 | 0.936842 | -0.752523 | -0.710930 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14196 | -1.616962 | -1.555417 | -1.921991 | -1.336060 | -1.275729 | 4.316540 | -1.114070 | -0.464910 | -0.429188 |
| 14197 | 0.201414 | 0.307522 | 0.799781 | 0.214394 | 0.206587 | 1.565527 | -0.981539 | 2.072669 | 2.168658 |
| 14198 | 0.621797 | 0.623916 | 0.452080 | 0.258230 | 0.274346 | -0.645890 | 0.936842 | -0.902438 | -0.842973 |
| 14199 | 0.026302 | -0.005710 | 0.992737 | -0.508810 | -0.381553 | -0.662523 | 0.936842 | -0.710579 | -0.677101 |
| 14200 | -1.599816 | -1.543243 | 1.964612 | -0.147208 | 0.331127 | -0.448631 | -1.290778 | 2.051697 | 2.146972 |
14201 rows × 9 columns
Y
array([ 2.41 , 75. , 24.761, ..., 60. , 15. , 4.4 ])
# Linear Regression grid search - FIRST RUN ONLY
# NOTE: cell deliberately disabled by wrapping the code in a triple-quoted string;
# the search was executed once and its results saved to the CSV re-read further below.
'''
key = 'Linear Regression'
algorithm = LinearRegression()
params = [{}]
print(f'Now testing {key}')
linearResult = best_model(X, Y, algorithm, params, cv=5)
linearResult = result_to_file(key, linearResult, 'appendix/linear-result-fselection.csv')
'''
"\nkey = 'Linear Regression'\nalgorithm = LinearRegression()\nparams = [{}]\nprint(f'Now testing {key}')\nlinearResult = best_model(X, Y, algorithm, params, cv=5)\nlinearResult = result_to_file(key, linearResult, 'appendix/linear-result-fselection.csv')\n"
# Decision Tree Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell (code kept inside a triple-quoted string);
# results were persisted once to the CSV that the merge step below reads back.
'''
key = 'Decision Tree Regressor'
algorithm = DecisionTreeRegressor()
params = [
{
'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
'splitter': ['best', 'random'],
'max_depth': [None, 2, 3, 4, 5],
'min_samples_split': [2, 3, 4, 5, 6],
'max_leaf_nodes': [None, 2, 3, 4, 5],
'random_state': [random_state],
}
]
print(f'Now testing {key}')
decisionTreeResult = best_model(X, Y, algorithm, params, cv=5)
decisionTreeResult = result_to_file(key, decisionTreeResult, 'appendix/decision-tree-result-fselection.csv')
'''
"\nkey = 'Decision Tree Regressor'\nalgorithm = DecisionTreeRegressor()\nparams = [\n {\n 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],\n 'splitter': ['best', 'random'],\n 'max_depth': [None, 2, 3, 4, 5],\n 'min_samples_split': [2, 3, 4, 5, 6],\n 'max_leaf_nodes': [None, 2, 3, 4, 5],\n 'random_state': [random_state],\n }\n]\nprint(f'Now testing {key}')\ndecisionTreeResult = best_model(X, Y, algorithm, params, cv=5)\ndecisionTreeResult = result_to_file(key, decisionTreeResult, 'appendix/decision-tree-result-fselection.csv')\n"
# Random Forest Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Random Forest Regressor'
algorithm = RandomForestRegressor()
params = [
{
'n_estimators': [100, 500, 1000],
'max_depth': [2, 3, 4, 5],
'min_samples_split': [2, 3, 4],
'max_leaf_nodes': [None, 2, 3, 4],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
random_forest_result = best_model(X, Y, algorithm, params, cv=5)
random_forest_result = result_to_file(key, random_forest_result, 'appendix/random-forest-result-fselection.csv')
'''
"\nkey = 'Random Forest Regressor'\nalgorithm = RandomForestRegressor()\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'max_depth': [2, 3, 4, 5],\n 'min_samples_split': [2, 3, 4],\n 'max_leaf_nodes': [None, 2, 3, 4],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\nrandom_forest_result = best_model(X, Y, algorithm, params, cv=5)\nrandom_forest_result = result_to_file(key, random_forest_result, 'appendix/random-forest-result-fselection.csv')\n"
# Ada Boost with Decision Tree Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Ada Boost with Decision Tree Regressor'
algorithm = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())
params = [
{
'n_estimators': [100, 500, 1000],
'learning_rate': [0.2, 0.5, 0.8, 1],
'loss': ['linear', 'square', 'exponential'],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
ada_boost_dt_result = best_model(X, Y, algorithm, params, cv=5)
ada_boost_dt_result = result_to_file(key, ada_boost_dt_result, 'appendix/ada-boost-dt-fselection.csv')
'''
"\nkey = 'Ada Boost with Decision Tree Regressor'\nalgorithm = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'learning_rate': [0.2, 0.5, 0.8, 1],\n 'loss': ['linear', 'square', 'exponential'],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\nada_boost_dt_result = best_model(X, Y, algorithm, params, cv=5)\nada_boost_dt_result = result_to_file(key, ada_boost_dt_result, 'appendix/ada-boost-dt-fselection.csv')\n"
# Ada Boost with Linear Regression grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Ada Boost with Linear Regression'
algorithm = AdaBoostRegressor(base_estimator=LinearRegression())
params = [
{
'n_estimators': [100, 500, 1000],
'learning_rate': [0.2, 0.5, 0.8, 1],
'loss': ['linear', 'square', 'exponential'],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
ada_boost_lr_result = best_model(X, Y, algorithm, params, cv=5)
ada_boost_lr_result = result_to_file(key, ada_boost_lr_result, 'appendix/ada-boost-lr-fselection.csv')
'''
"\nkey = 'Ada Boost with Linear Regression'\nalgorithm = AdaBoostRegressor(base_estimator=LinearRegression())\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'learning_rate': [0.2, 0.5, 0.8, 1],\n 'loss': ['linear', 'square', 'exponential'],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\nada_boost_lr_result = best_model(X, Y, algorithm, params, cv=5)\nada_boost_lr_result = result_to_file(key, ada_boost_lr_result, 'appendix/ada-boost-lr-fselection.csv')\n"
# Gradient Boosting Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Gradient Boosting Regressor'
algorithm = GradientBoostingRegressor()
params = [
{
'n_estimators': [100, 500, 1000],
'min_samples_split': [2, 3, 4, 5],
'max_depth': [2, 3, 4, 5],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
gradient_boosting_result = best_model(X, Y, algorithm, params, cv=5)
gradient_boosting_result = result_to_file(key, gradient_boosting_result, 'appendix/gradient-boost-fselection.csv')
'''
"\nkey = 'Gradient Boosting Regressor'\nalgorithm = GradientBoostingRegressor()\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'min_samples_split': [2, 3, 4, 5],\n 'max_depth': [2, 3, 4, 5],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\ngradient_boosting_result = best_model(X, Y, algorithm, params, cv=5)\ngradient_boosting_result = result_to_file(key, gradient_boosting_result, 'appendix/gradient-boost-fselection.csv')\n"
# reading the result output file of each estimator into a dataframe
# (read_csv already returns a DataFrame, so no extra pd.DataFrame wrapper is needed;
# 'Unnamed: 0' is the index column that to_csv wrote out during the first run)
LR_result_fs = pd.read_csv("appendix/linear-result-fselection.csv", sep=",").drop(columns='Unnamed: 0')
DT_result_fs = pd.read_csv("appendix/decision-tree-result-fselection.csv", sep=",").drop(columns='Unnamed: 0')
RF_result_fs = pd.read_csv("appendix/random-forest-result-fselection.csv", sep=",").drop(columns='Unnamed: 0')
Ada_LR_result_fs = pd.read_csv("appendix/ada-boost-lr-fselection.csv", sep=",").drop(columns='Unnamed: 0')
Ada_DT_result_fs = pd.read_csv("appendix/ada-boost-dt-fselection.csv", sep=",").drop(columns='Unnamed: 0')
Gradient_result_fs = pd.read_csv("appendix/gradient-boost-fselection.csv", sep=",").drop(columns='Unnamed: 0')
# merging all in a final results dataframe
# BUG FIX: the original `LR_result_fs.append([LR_result_fs, ...])` included the
# Linear Regression results twice; pd.concat also replaces DataFrame.append,
# which was removed in pandas 2.0
final_results_fs = pd.concat(
    [LR_result_fs, DT_result_fs, RF_result_fs, Ada_LR_result_fs, Ada_DT_result_fs, Gradient_result_fs],
    ignore_index=True,
)
# sorting by mean_test_score - descending
final_results_fs_sorted = final_results_fs.sort_values(by="mean_test_score", ascending=False)
# selecting the top 10 models with the best mean_test_score
final_results_fs_top10 = final_results_fs_sorted.head(10)
print("\nTop 10 best models with feature selection:\n")
final_results_fs_top10
Top 10 best models with feature selection:
| Algorithm | params | mean_test_score | |
|---|---|---|---|
| 1265 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 1000, 'random_state': 42} | 0.899196 |
| 1259 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 3, 'n_estimators': 1000, 'random_state': 42} | 0.899122 |
| 1262 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 1000, 'random_state': 42} | 0.898333 |
| 1256 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 1000, 'random_state': 42} | 0.897450 |
| 1185 | Ada Boost with Decision Tree Regressor | {'learning_rate': 0.2, 'loss': 'square', 'n_estimators': 100, 'random_state': 42} | 0.896907 |
| 1258 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 3, 'n_estimators': 500, 'random_state': 42} | 0.896711 |
| 1264 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 500, 'random_state': 42} | 0.896620 |
| 1261 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 500, 'random_state': 42} | 0.895696 |
| 1255 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 500, 'random_state': 42} | 0.894743 |
| 1188 | Ada Boost with Decision Tree Regressor | {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 100, 'random_state': 42} | 0.894579 |
# finding the best model using PCA method
# reuse the PCA-projected training set as the design matrix for this grid search
X = xtrain_pc
# flatten the target frame to a 1-D array, the shape sklearn estimators expect
Y = Y_train.values.ravel()
# Linear Regression grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell (code kept inside a triple-quoted string);
# results were saved once to the CSV that the PCA merge step below reads back.
'''
key = 'Linear Regression'
algorithm = LinearRegression()
params = [{}]
print(f'Now testing {key}')
linearResult = best_model(X, Y, algorithm, params, cv=5)
linearResult = result_to_file(key, linearResult, 'appendix/linear-result-PCA.csv')
'''
"\nkey = 'Linear Regression'\nalgorithm = LinearRegression()\nparams = [{}]\nprint(f'Now testing {key}')\nlinearResult = best_model(X, Y, algorithm, params, cv=5)\nlinearResult = result_to_file(key, linearResult, 'appendix/linear-result-PCA.csv')\n"
# Decision Tree Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Decision Tree Regressor'
algorithm = DecisionTreeRegressor()
params = [
{
'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
'splitter': ['best', 'random'],
'max_depth': [None, 2, 3, 4, 5],
'min_samples_split': [2, 3, 4, 5, 6],
'max_leaf_nodes': [None, 2, 3, 4, 5],
'random_state': [random_state],
}
]
print(f'Now testing {key}')
decisionTreeResult = best_model(X, Y, algorithm, params, cv=5)
decisionTreeResult = result_to_file(key, decisionTreeResult, 'appendix/decision-tree-result-PCA.csv')
'''
"\nkey = 'Decision Tree Regressor'\nalgorithm = DecisionTreeRegressor()\nparams = [\n {\n 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],\n 'splitter': ['best', 'random'],\n 'max_depth': [None, 2, 3, 4, 5],\n 'min_samples_split': [2, 3, 4, 5, 6],\n 'max_leaf_nodes': [None, 2, 3, 4, 5],\n 'random_state': [random_state],\n }\n]\nprint(f'Now testing {key}')\ndecisionTreeResult = best_model(X, Y, algorithm, params, cv=5)\ndecisionTreeResult = result_to_file(key, decisionTreeResult, 'appendix/decision-tree-result-PCA.csv')\n"
# Random Forest Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Random Forest Regressor'
algorithm = RandomForestRegressor()
params = [
{
'n_estimators': [100, 500, 1000],
'max_depth': [2, 3, 4, 5],
'min_samples_split': [2, 3, 4],
'max_leaf_nodes': [None, 2, 3, 4],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
random_forest_result = best_model(X, Y, algorithm, params, cv=5)
random_forest_result = result_to_file(key, random_forest_result, 'appendix/random-forest-result-PCA.csv')
'''
"\nkey = 'Random Forest Regressor'\nalgorithm = RandomForestRegressor()\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'max_depth': [2, 3, 4, 5],\n 'min_samples_split': [2, 3, 4],\n 'max_leaf_nodes': [None, 2, 3, 4],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\nrandom_forest_result = best_model(X, Y, algorithm, params, cv=5)\nrandom_forest_result = result_to_file(key, random_forest_result, 'appendix/random-forest-result-PCA.csv')\n"
# Ada Boost with Decision Tree Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Ada Boost with Decision Tree Regressor'
algorithm = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())
params = [
{
'n_estimators': [100, 500, 1000],
'learning_rate': [0.2, 0.5, 0.8, 1],
'loss': ['linear', 'square', 'exponential'],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
ada_boost_dt_result = best_model(X, Y, algorithm, params, cv=5)
ada_boost_dt_result = result_to_file(key, ada_boost_dt_result, 'appendix/ada-boost-dt-PCA.csv')
'''
"\nkey = 'Ada Boost with Decision Tree Regressor'\nalgorithm = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'learning_rate': [0.2, 0.5, 0.8, 1],\n 'loss': ['linear', 'square', 'exponential'],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\nada_boost_dt_result = best_model(X, Y, algorithm, params, cv=5)\nada_boost_dt_result = result_to_file(key, ada_boost_dt_result, 'appendix/ada-boost-dt-PCA.csv')\n"
# Ada Boost with Linear Regression grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Ada Boost with Linear Regression'
algorithm = AdaBoostRegressor(base_estimator=LinearRegression())
params = [
{
'n_estimators': [100, 500, 1000],
'learning_rate': [0.2, 0.5, 0.8, 1],
'loss': ['linear', 'square', 'exponential'],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
ada_boost_lr_result = best_model(X, Y, algorithm, params, cv=5)
ada_boost_lr_result = result_to_file(key, ada_boost_lr_result, 'appendix/ada-boost-lr-PCA.csv')
'''
"\nkey = 'Ada Boost with Linear Regression'\nalgorithm = AdaBoostRegressor(base_estimator=LinearRegression())\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'learning_rate': [0.2, 0.5, 0.8, 1],\n 'loss': ['linear', 'square', 'exponential'],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\nada_boost_lr_result = best_model(X, Y, algorithm, params, cv=5)\nada_boost_lr_result = result_to_file(key, ada_boost_lr_result, 'appendix/ada-boost-lr-PCA.csv')\n"
# Gradient Boosting Regressor grid search - FIRST RUN ONLY
# NOTE: disabled first-run cell; results were saved once to the CSV read back below.
'''
key = 'Gradient Boosting Regressor'
algorithm = GradientBoostingRegressor()
params = [
{
'n_estimators': [100, 500, 1000],
'min_samples_split': [2, 3, 4, 5],
'max_depth': [2, 3, 4, 5],
'random_state': [random_state]
}
]
print(f'Now testing {key}')
gradient_boosting_result = best_model(X, Y, algorithm, params, cv=5)
gradient_boosting_result = result_to_file(key, gradient_boosting_result, 'appendix/gradient-boost-PCA.csv')
'''
"\nkey = 'Gradient Boosting Regressor'\nalgorithm = GradientBoostingRegressor()\nparams = [\n {\n 'n_estimators': [100, 500, 1000],\n 'min_samples_split': [2, 3, 4, 5],\n 'max_depth': [2, 3, 4, 5],\n 'random_state': [random_state]\n }\n]\nprint(f'Now testing {key}')\ngradient_boosting_result = best_model(X, Y, algorithm, params, cv=5)\ngradient_boosting_result = result_to_file(key, gradient_boosting_result, 'appendix/gradient-boost-PCA.csv')\n"
# reading the result output file of each estimator into a dataframe
# (read_csv already returns a DataFrame; 'Unnamed: 0' is the index column
# that to_csv wrote out during the first run)
LR_result_pca = pd.read_csv("appendix/linear-result-PCA.csv", sep=",").drop(columns='Unnamed: 0')
DT_result_pca = pd.read_csv("appendix/decision-tree-result-PCA.csv", sep=",").drop(columns='Unnamed: 0')
RF_result_pca = pd.read_csv("appendix/random-forest-result-PCA.csv", sep=",").drop(columns='Unnamed: 0')
Ada_LR_result_pca = pd.read_csv("appendix/ada-boost-lr-PCA.csv", sep=",").drop(columns='Unnamed: 0')
Ada_DT_result_pca = pd.read_csv("appendix/ada-boost-dt-PCA.csv", sep=",").drop(columns='Unnamed: 0')
Gradient_result_pca = pd.read_csv("appendix/gradient-boost-PCA.csv", sep=",").drop(columns='Unnamed: 0')
# merging all in a final results dataframe
# BUG FIX: the original `LR_result_pca.append([LR_result_pca, ...])` included the
# Linear Regression results twice; pd.concat also replaces DataFrame.append,
# which was removed in pandas 2.0
final_results_pca = pd.concat(
    [LR_result_pca, DT_result_pca, RF_result_pca, Ada_LR_result_pca, Ada_DT_result_pca, Gradient_result_pca],
    ignore_index=True,
)
# sorting by mean_test_score - descending
final_results_pc_sorted = final_results_pca.sort_values(by="mean_test_score", ascending=False)
# selecting the top 10 models with the best mean_test_score
final_results_pc_top10 = final_results_pc_sorted.head(10)
print("\nTop 10 best models with Principal Components:\n")
final_results_pc_top10
Top 10 best models with Principal Components:
| Algorithm | params | mean_test_score | |
|---|---|---|---|
| 1185 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.2, 'loss': 'square', 'n_estimators': 100, 'random_state': 42} | 0.852275 |
| 1197 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.5, 'loss': 'exponential', 'n_estimators': 100, 'random_state': 42} | 0.852186 |
| 1188 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 100, 'random_state': 42} | 0.851817 |
| 1191 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.5, 'loss': 'linear', 'n_estimators': 100, 'random_state': 42} | 0.850573 |
| 1183 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.2, 'loss': 'linear', 'n_estimators': 500, 'random_state': 42} | 0.850042 |
| 1194 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.5, 'loss': 'square', 'n_estimators': 100, 'random_state': 42} | 0.849272 |
| 1182 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.2, 'loss': 'linear', 'n_estimators': 100, 'random_state': 42} | 0.848505 |
| 1206 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.8, 'loss': 'exponential', 'n_estimators': 100, 'random_state': 42} | 0.847830 |
| 1189 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.2, 'loss': 'exponential', 'n_estimators': 500, 'random_state': 42} | 0.846767 |
| 1200 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.8, 'loss': 'linear', 'n_estimators': 100, 'random_state': 42} | 0.843199 |
Now that we know which are the best models for the feature-selection and principal-components sets, we will proceed to test the models.
Best Model for feature selection set:
# display the single best model found with the feature-selection set (highest mean_test_score)
final_results_fs_top10.head(1)
| Algorithm | params | mean_test_score | |
|---|---|---|---|
| 1265 | Gradient Boosting Regressor | {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 1000, 'random_state': 42} | 0.899196 |
# training the best model for the feature-selection set:
# Gradient Boosting with the hyper-parameters found by the grid search above
reg = GradientBoostingRegressor(max_depth=5, min_samples_split=5, n_estimators=1000, random_state=random_state)
reg = reg.fit(x_train_scaled[selected_names], Y_train.values.ravel())
# score() computes the test-set predictions internally, so the original separate
# reg.predict(...) call, whose result was discarded, has been removed as dead code
R2 = reg.score(x_test_scaled[selected_names], Y_test)
print("\nR2 - Coefficient of determination of the prediction for the best features selected is: ", R2)
R2 - Coefficient of determination of the prediction for the best features selected is: 0.9056962393750775
Best Model for principal components set:
# display the single best model found with the principal-components set (highest mean_test_score)
final_results_pc_top10.head(1)
| Algorithm | params | mean_test_score | |
|---|---|---|---|
| 1185 | Ada Boost with Decision Tree Regression | {'learning_rate': 0.2, 'loss': 'square', 'n_estimators': 100, 'random_state': 42} | 0.852275 |
# training the best model for the principal-components set:
# AdaBoost over a Decision Tree with the hyper-parameters found by the grid search above
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
# removed in 1.4 — confirm the pinned sklearn version before upgrading
reg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2, loss='square', n_estimators=100, random_state=random_state)
reg = reg.fit(xtrain_pc, Y_train.values.ravel())
# score() computes the test-set predictions internally, so the original separate
# reg.predict(...) call, whose result was discarded, has been removed as dead code
R2 = reg.score(xtest_pc, Y_test)
print("\nR2 - Coefficient of determination of the prediction for the principal components is: ", R2)
R2 - Coefficient of determination of the prediction for the principal components is: 0.8590995898302416